func-regexp.scm 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962
  1. ; Copyright (c) 1993-2007 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; All regular expressions are records of the following type.
  3. ;
  4. ; type - a symbol indicating which type of regular expression this is.
  5. ; compiled - cache for the compiled Posix regular expression; initially #F
  6. ; field0 - data fields containing
  7. ; field1 - type-specific
  8. ; field2 - information
  9. ;
  10. ; Regular expressions are not modified, except for adding cached Posix values.
  11. ;
  12. ; The different types of regular expressions are:
  13. ; Set
  14. ; use-case - character bitmask for case-sensitive matching
  15. ; no-case - character bitmask for case-insensitive matching
  16. ; string - cached Posix regular expression for this set
  17. ; String-Start
  18. ; String-End
  19. ; Submatch
  20. ; subexp - the subexpression whose match should be reported
  21. ; id - EQ? tag used to identify the substring that matched
  22. ; Sequence
  23. ; subexps - list subexpressions to be matched in order
  24. ; One-Of
  25. ; subexps - list of subexpressions, any one of which is to match
  26. ; Repeat
  27. ; low - minimum count
  28. ; high - maximum count, or #f if unbounded
  29. ; subexp - subexpression to be repeated
  30. (define-record-type regexp :regexp
  31. (make-regexp type compiled field0 field1 field2)
  32. regexp?
  33. (type regexp-type)
  34. (compiled regexp-compiled set-regexp-compiled!)
  35. (field0 regexp-field0)
  36. (field1 regexp-field1)
  37. (field2 regexp-field2 set-regexp-field2!))
  38. (define-record-discloser :regexp
  39. (lambda (regexp)
  40. (list 'regexp (regexp-type regexp))))
  41. ; A goofy macro to make the expression-type definitions more readable.
  42. (define-syntax define-regexp-type
  43. (syntax-rules ()
  44. ((define-regexp-type name maker predicate)
  45. (begin
  46. (define (maker) (make-regexp 'name #f #f #f #f))
  47. (define (predicate regexp)
  48. (eq? (regexp-type regexp) 'name))))
  49. ((define-regexp-type name maker predicate slot0)
  50. (begin
  51. (define (maker x)
  52. (make-regexp 'name #f x #f #f))
  53. (define (predicate regexp)
  54. (eq? (regexp-type regexp) 'name))
  55. (define slot0 regexp-field0)))
  56. ((define-regexp-type name maker predicate slot0 slot1)
  57. (begin
  58. (define (maker x y) (make-regexp 'name #f x y #f))
  59. (define (predicate regexp) (eq? (regexp-type regexp) 'name))
  60. (define slot0 regexp-field0)
  61. (define slot1 regexp-field1)))
  62. ((define-regexp-type name maker predicate slot0 slot1 slot2)
  63. (begin
  64. (define (maker x y z) (make-regexp 'name #f x y z))
  65. (define (predicate regexp) (eq? (regexp-type regexp) 'name))
  66. (define slot0 regexp-field0)
  67. (define slot1 regexp-field1)
  68. (define slot2 regexp-field2)))))
  69. ; Character sets
  70. ;
  71. ; Each has two bitsets, one which is the case-sensitive version of the set and
  72. ; the other the case-insensitive.
  73. ;
  74. ; There are two cached values, the POSIX regexp string for the set and the
  75. ; actual regexp object. The second is set only if this set is the entire
  76. ; expression being matched.
  77. (define-regexp-type set really-make-set set?
  78. set-use-case
  79. set-no-case
  80. set-string)
  81. (define set-set-string! set-regexp-field2!)
  82. (define (make-set case no-case)
  83. (really-make-set case no-case #f))
  84. (define the-empty-set
  85. (really-make-set 0 0 #f))
  86. (define (empty-set? set)
  87. (and (set? set)
  88. (= 0 (set-use-case set))
  89. (= 0 (set-no-case set))))
  90. (define (char->mask char)
  91. (arithmetic-shift 1 (char->scalar-value char)))
  92. (define (char-in-set? char set)
  93. (not (zero? (bitwise-and (set-use-case set)
  94. (char->mask char)))))
  95. (define char-limit 256) ; allow eight-bit characters
  96. ; A vector mapping Latin-1 values to case-insensitive bitsets.
  97. ; It's unclear how to make this work with Unicode while preserving
  98. ; the underlying 8-bit POSIX API.
  99. (define no-case-char-masks
  100. (reduce ((count* i 0 char-limit))
  101. ((masks '()))
  102. (cons (let ((ch (scalar-value->char i)))
  103. (bitwise-ior (arithmetic-shift 1 i)
  104. (cond ((and (< i 128)
  105. (char-upper-case? ch))
  106. (char->mask (char-downcase ch)))
  107. ((and (< i 128)
  108. (char-lower-case? ch))
  109. (char->mask (char-upcase ch)))
  110. (else
  111. 0))))
  112. masks)
  113. (list->vector (reverse masks))))
  114. ; A vector of the single-character sets.
  115. (define singleton-sets
  116. (reduce ((count* i 0 char-limit)) ; allow eight-bit characters
  117. ((sets '()))
  118. (cons (make-set (arithmetic-shift 1 i)
  119. (vector-ref no-case-char-masks i))
  120. sets)
  121. (list->vector (reverse sets))))
  122. (define (char->set char)
  123. (vector-ref singleton-sets (char->scalar-value char)))
  124. ; Arguments can be strings or single characters. We walk down all of the
  125. ; characters, or-ing their masks together.
  126. (define (set . all-args)
  127. (if (and (= 1 (length all-args))
  128. (char? (car all-args)))
  129. (char->set (car all-args))
  130. (reduce ((list* arg all-args))
  131. ((case 0)
  132. (no-case 0))
  133. (cond ((char? arg)
  134. (add-char-masks arg case no-case))
  135. ((string? arg)
  136. (add-string-masks arg case no-case))
  137. (else
  138. (apply call-error "invalid argument" set all-args)))
  139. (make-set case no-case))))
  140. (define (add-char-masks char case no-case)
  141. (values (bitwise-ior case (char->mask char))
  142. (bitwise-ior no-case
  143. (vector-ref no-case-char-masks
  144. (char->scalar-value char)))))
  145. (define (add-string-masks string case no-case)
  146. (reduce ((string* char string))
  147. ((case case)
  148. (no-case no-case))
  149. (add-char-masks char case no-case)))
  150. ; Ranges. Again, we loop through the ranges building up two masks.
  151. (define (range low high)
  152. (or (real-ranges `(,low ,high) char->integer integer->scalar-value)
  153. (call-error "invalid argument" range low high)))
  154. (define (ranges . limits)
  155. (or (real-ranges limits char->integer integer->scalar-value)
  156. (apply call-error "invalid argument" ranges limits)))
  157. (define (ascii-range low high)
  158. (or (real-ranges `(,low ,high) char->ascii identity)
  159. (call-error "invalid argument" ascii-range low high)))
  160. (define (ascii-ranges . limits)
  161. (or (real-ranges limits char->ascii identity)
  162. (apply call-error "invalid argument" ascii-ranges limits)))
  163. (define (integer->scalar-value i)
  164. (char->scalar-value (integer->char i)))
  165. (define (identity i)
  166. i)
  167. ; LIMITS is a list of lists (<start-char> <end-char>), CHAR->INT returns an
  168. ; integer given a character and INT->SCALAR-VALUE translates that integer to the
  169. ; corresponding scalar value.
  170. (define (real-ranges limits char->int int->scalar-value)
  171. (if (every char? limits)
  172. (let loop ((to-do limits) (case 0) (no-case 0))
  173. (cond ((null? to-do)
  174. (make-set case no-case))
  175. ((null? (cdr to-do))
  176. #f)
  177. (else
  178. (let ((start (char->int (car to-do)))
  179. (end (char->int (cadr to-do))))
  180. (if (< end start)
  181. #f
  182. (reduce ((count* i start (+ end 1)))
  183. ((case case)
  184. (no-case no-case))
  185. (let ((scalar-value (int->scalar-value i)))
  186. (values (bitwise-ior case
  187. (arithmetic-shift 1 scalar-value))
  188. (bitwise-ior no-case
  189. (vector-ref no-case-char-masks
  190. scalar-value))))
  191. (loop (cddr to-do) case no-case)))))))
  192. #f))
  193. ; The mask with all ones.
  194. (define all-chars (- (arithmetic-shift 1 char-limit) 1))
  195. (define all-chars-except-nul (- all-chars 1))
  196. ; Set operations.
  197. (define (negate set)
  198. (make-set (bitwise-xor all-chars-except-nul (set-use-case set))
  199. (bitwise-xor all-chars-except-nul (set-no-case set))))
  200. (define (set-binop op)
  201. (lambda (set1 set2)
  202. (make-set (op (set-use-case set1)
  203. (set-use-case set2))
  204. (op (set-no-case set1)
  205. (set-no-case set2)))))
  206. (define intersection (set-binop bitwise-and))
  207. (define union (set-binop bitwise-ior))
  208. (define subtract
  209. (set-binop (lambda (x y)
  210. (bitwise-xor x
  211. (bitwise-and x y)))))
  212. ; Predefined sets. These are from the LC_CTYPE category in the POSIX
  213. ; Locale, which is the nearest POSIX gets to defining character classes.
  214. (define lower-case (range #\a #\z))
  215. (define upper-case (range #\A #\Z))
  216. (define alphabetic (union lower-case upper-case))
  217. (define numeric (range #\0 #\9))
  218. (define alphanumeric (union alphabetic numeric))
  219. (define punctuation (set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"))
  220. (define blank (set #\space (scalar-value->char 9))) ;#\tab
  221. (define graphic (union alphanumeric punctuation))
  222. (define whitespace (apply set (map scalar-value->char
  223. '(32 ;space
  224. 9 ;tab
  225. 10 ;newline
  226. 11 ;vertical tab
  227. 12 ;form feed
  228. 13)))) ;return
  229. (define printing (union graphic (set #\space)))
  230. (define control (negate printing))
  231. (define hexdigit (union numeric (ranges #\a #\f #\A #\F)))
  232. ;----------------
  233. ; String beginning and end anchors.
  234. (define-regexp-type string-start make-string-start string-start?)
  235. (define-regexp-type string-end make-string-end string-end?)
  236. (define the-string-start (make-string-start))
  237. (define the-string-end (make-string-end))
  238. (define (string-start)
  239. the-string-start)
  240. (define (string-end)
  241. the-string-end)
  242. ;----------------
  243. ; Expression types.
  244. ;
  245. ; There are four records for expressions: submatches, sequences, one-ofs,
  246. ; and repeats. The first and last have a single subexpression, the middle
  247. ; two each have a list.
  248. ;
  249. ; Each record contains a slot for caching a POSIX regexp object. This is
  250. ; set only if that record is the entire expression being matched.
  251. ; A submatch is just a marker wrapped around another expression, with a tag
  252. ; used to identify the matched substring.
  253. (define-regexp-type submatch make-submatch submatch?
  254. submatch-exp
  255. submatch-id)
  256. (define (submatch id exp)
  257. (cond ((not (regexp? exp))
  258. (call-error "invalid argument" submatch exp))
  259. ((empty-set? exp)
  260. the-empty-set)
  261. (else
  262. (make-submatch exp id))))
  263. ; Sequences.
  264. (define-regexp-type sequence make-sequence sequence?
  265. sequence-exps)
  266. ; Epsilon is encoded as an empty sequence.
  267. (define epsilon (make-sequence '()))
  268. (define (epsilon? x)
  269. (and (sequence? x)
  270. (null? (sequence-exps x))))
  271. ; We splice out any subordinate sequences. An empty-set means that the sequence
  272. ; itself can never match and is thus the empty-set as well.
  273. ;
  274. ; We can't use REDUCE because we add on to the EXPS list.
  275. (define (sequence . all-exps)
  276. (let loop ((exps all-exps) (res '()))
  277. (if (null? exps)
  278. (cond ((null? res)
  279. epsilon)
  280. ((null? (cdr res))
  281. (car res))
  282. (else
  283. (make-sequence (reverse res))))
  284. (let ((exp (car exps)))
  285. (cond ((not (regexp? exp))
  286. (apply call-error "invalid argument" sequence all-exps))
  287. ((empty-set? exp)
  288. the-empty-set)
  289. ((sequence? exp)
  290. (loop (append (sequence-exps exp) (cdr exps)) res))
  291. (else
  292. (loop (cdr exps) (cons exp res))))))))
  293. ; one-ofs
  294. (define-regexp-type one-of make-one-of one-of?
  295. one-of-exps)
  296. ; We splice out any subordinate one-ofs and merge all sets into a single set.
  297. ; An empty one-of is the empty set. Any subordinate empty-sets can be dropped.
  298. (define (one-of . all-exps)
  299. (let loop ((exps all-exps) (res '()) (set the-empty-set))
  300. (if (null? exps)
  301. (cond ((null? res)
  302. set)
  303. ((and (null? (cdr res))
  304. (empty-set? set))
  305. (car res))
  306. (else
  307. (make-one-of (if (empty-set? set)
  308. (reverse res)
  309. (cons set (reverse res))))))
  310. (let ((exp (car exps)))
  311. (cond ((not (regexp? exp))
  312. (apply call-error "invalid argument" sequence all-exps))
  313. ((empty-set? exp)
  314. (loop (cdr exps) res set))
  315. ((set? exp)
  316. (loop (cdr exps) res (union exp set)))
  317. ((one-of? exp)
  318. (loop (append (one-of-exps exp) (cdr exps)) res set))
  319. (else
  320. (loop (cdr exps) (cons exp res) set)))))))
  321. ; (TEXT <string>) matches the <string>.
  322. (define (text string)
  323. (if (string? string)
  324. (apply sequence (map char->set (string->list string)))
  325. (call-error "invalid argument" text string)))
  326. ; Repetitions
  327. ; LOW is an integer >= 0.
  328. ; HIGH is either an integer >= LOW or #F, indicating that there is no limit.
  329. ; EXP is the expression to be repeated.
  330. (define-regexp-type repeat really-make-repeat repeat?
  331. repeat-low
  332. repeat-high
  333. repeat-exp)
  334. ; Optimizations:
  335. ; - if both high and low are 0 this is the empty set
  336. ; - if both high and low are 1 this is just the expression
  337. ; - the empty set is epsilon if zero repetitions are allowed, or itself
  338. ; if at least one repetition is required.
  339. (define (make-repeat low high exp)
  340. (cond ((not (and (integer? low)
  341. (<= 0 low)
  342. (regexp? exp)
  343. (or (not high)
  344. (and (integer? high)
  345. (<= low high)))))
  346. (call-error "invalid argument" make-repeat low high exp))
  347. ((or (epsilon? exp)
  348. (and high
  349. (= low high 0)))
  350. epsilon)
  351. ((and high
  352. (= low high 1))
  353. exp)
  354. ((empty-set? exp)
  355. (if (and (= low 0)
  356. (or (not high)
  357. (= high 0)))
  358. epsilon
  359. the-empty-set))
  360. (else
  361. (really-make-repeat low high exp))))
  362. ; One argument is an expression that can appear any number of times.
  363. ; Two arguments are LOW and an expression.
  364. ; Three arguments are LOW, HIGH, and an expression.
  365. (define (repeat . stuff)
  366. (case (length stuff)
  367. ((0)
  368. (call-error "invalid argument" repeat))
  369. ((1)
  370. (make-repeat 0 #f (car stuff)))
  371. ((2)
  372. (make-repeat (car stuff) (car stuff) (cadr stuff)))
  373. ((3)
  374. (apply make-repeat stuff))
  375. (else
  376. (apply call-error "invalid argument" repeat stuff))))
  377. ;----------------
  378. ; Three functions that transform EXP instead of having their own record type.
  379. ; They are idempotent: (eq? (f x) (f (f x))) => #t.
  380. (define (ignore-case exp)
  381. (start-expression-map ignore-case 'no-case exp))
  382. (define (use-case exp)
  383. (start-expression-map use-case 'use-case exp))
  384. (define (no-submatches exp)
  385. (start-expression-map no-submatches 'no-submatch exp))
  386. ; FUNCTION is one of:
  387. ; no-case - replace use-case with no-case in all sets
  388. ; use-case - replace no-case with use-case in all sets
  389. ; no-submatch - remove all submatches
  390. ; This reuses as much of EXP as possible.
  391. (define (start-expression-map proc function exp)
  392. (if (regexp? exp)
  393. (or (expression-map function exp)
  394. exp)
  395. (call-error "invalid argument" proc exp)))
  396. ; This returns #F if FUNCTION does not modify EXP.
  397. (define (expression-map function exp)
  398. (let recur ((exp exp))
  399. (cond ((set? exp)
  400. (if (or (eq? function 'no-submatch)
  401. (= (set-use-case exp)
  402. (set-no-case exp)))
  403. #f
  404. (let ((chars (if (eq? function 'no-case)
  405. (set-no-case exp)
  406. (set-use-case exp))))
  407. (make-set chars chars))))
  408. ((submatch? exp)
  409. (let ((sub (submatch-exp exp)))
  410. (if (eq? function 'no-submatch)
  411. (or (recur sub)
  412. sub)
  413. (let ((new (recur sub)))
  414. (if new
  415. (submatch (submatch-id exp) new)
  416. #f)))))
  417. ((sequence? exp)
  418. (let ((exps (expression-list-map function (sequence-exps exp))))
  419. (if exps
  420. (make-sequence exps)
  421. #f)))
  422. ((one-of? exp)
  423. (let ((exps (expression-list-map function (one-of-exps exp))))
  424. (if exps
  425. (make-one-of exps)
  426. #f)))
  427. ((repeat? exp)
  428. (let ((new (recur (repeat-exp exp))))
  429. (if new
  430. (make-repeat (repeat-low exp)
  431. (repeat-high exp)
  432. new)
  433. #f)))
  434. ((or (string-start? exp)
  435. (string-end? exp))
  436. exp)
  437. (else
  438. (error "expression-map got a non-expression" exp)))))
  439. ; As above, this returns #F if FUNCTION leaves EXPS unchanged.
  440. (define (expression-list-map function exps)
  441. (let recur ((exps exps))
  442. (if (null? exps)
  443. #f
  444. (let ((new (expression-map function (car exps)))
  445. (more (recur (cdr exps))))
  446. (if (or new more)
  447. (cons (or new (car exps))
  448. (or more (cdr exps)))
  449. #f)))))
  450. ;----------------
  451. ; Translation to a POSIX regexp string.
  452. ;
  453. ; We get the string from EXP->STRINGS as a list of substrings to concatenate
  454. ; together. In this list (<id>) represents a #\( that begins a submatch with
  455. ; the given id and #F represents a #\( that does not begin a submatch. We walk
  456. ; down the list replacing these with #\( and collecting the ids and #f's into a
  457. ; list. When matching this list will be used to discard the match records that
  458. ; correspond to parens used for grouping while retaining those that correspond
  459. ; to submatches.
  460. ;
  461. ; This used to signal an error if a beginning-of-line or end-of-line occured
  462. ; in an unmatchable position. This is a questionable notion and the checking
  463. ; was not completely accurate, so I removed it.
  464. (define (exp->posix-string exp)
  465. (cond ((not (regexp? exp))
  466. (call-error "invalid argument" exp->posix-string exp))
  467. ((empty-set? exp)
  468. (call-error "no Posix string for the empty set" exp->posix-string exp))
  469. (else
  470. (reduce ((list* elt (exp->strings exp '())))
  471. ((strings '())
  472. (parens '()))
  473. (cond ((not elt)
  474. (values (cons "(" strings) (cons #f parens)))
  475. ((pair? elt)
  476. (values (cons "(" strings) (cons elt parens)))
  477. (else
  478. (values (cons elt strings) parens)))
  479. (values (apply string-append (reverse strings))
  480. (reverse parens))))))
  481. ; submatch -> (...), no need to parenthesize the subexp
  482. ; one-of -> ... | ... | ..., no need to parenthesize the subexps
  483. ; seq -> concatenation, need to parenthesize one-ofs
  484. ; repeat -> ...{x,y}, parenthesize sequences and one-ofs
  485. ;
  486. ; TAIL is a list of strings to come after those for EXP.
  487. (define (exp->strings exp tail)
  488. (cond ((set? exp)
  489. (cons (set->posix-string exp) tail))
  490. ((submatch? exp)
  491. `((,(submatch-id exp))
  492. ,@(exp->strings (submatch-exp exp) '())
  493. ")"
  494. . ,tail))
  495. ((one-of? exp)
  496. (one-of-strings (one-of-exps exp) tail))
  497. ((sequence? exp)
  498. (sequence-strings (sequence-exps exp) tail))
  499. ((repeat? exp)
  500. (repetition-strings (repeat-low exp)
  501. (repeat-high exp)
  502. (repeat-exp exp)
  503. tail))
  504. ((string-start? exp)
  505. (cons "^" tail))
  506. ((string-end? exp)
  507. (cons "$" tail))
  508. (else
  509. (error "bad expression" exp))))
  510. ; Add parentheses around the strings for EXP, encoding "(" as #F because it
  511. ; does not begin a submatch.
  512. (define (exp->parenthesized-strings exp tail)
  513. (cons #f (exp->strings exp (cons ")" tail))))
  514. ; Convert EXPS to strings, adding "|" between them. This depends on there
  515. ; being at least two EXPS.
  516. (define (one-of-strings exps tail)
  517. (let ((exps (reverse exps)))
  518. (reduce ((list* exp (cdr exps)))
  519. ((res (exp->strings (car exps) tail)))
  520. (exp->strings exp (cons "|" res)))))
  521. ; Convert EXPS to strings, adding parentheses for any that are ONE-OFs.
  522. (define (sequence-strings exps tail)
  523. (if (null? exps)
  524. `(#f ")" . ,tail)
  525. (reduce ((list* exp (reverse exps)))
  526. ((strings tail))
  527. (if (one-of? exp)
  528. (exp->parenthesized-strings exp strings)
  529. (exp->strings exp strings)))))
  530. ; e{x,} X or more Es
  531. ; e{x} exactly X Es
  532. ; e{x,y} between X and Y Es, inclusive
  533. (define (repetition-strings low high subexp tail)
  534. (let ((tail `("{"
  535. ,(number->string low)
  536. ,@(cond ((not high)
  537. (list ","))
  538. ((= low high)
  539. '())
  540. (else
  541. (list "," (number->string high))))
  542. "}"
  543. . ,tail)))
  544. (if (or (one-of? subexp)
  545. (sequence? subexp))
  546. (exp->parenthesized-strings subexp tail)
  547. (exp->strings subexp tail))))
  548. ;----------------
  549. ; Converting a set into POSIX regexp [...] sets. We have to avoid various
  550. ; pitfalls of the notation, which does not use a simple escape mechanism.
  551. ; The operator characters lose their meaning when in particular positions:
  552. ; ^ when first indicates negation, anywhere else it's normal
  553. ; ] is normal when first or second after ^
  554. ; - is normal as the first or last character
  555. ; [ is normal except when followed by any one of three characters: .=:
  556. ;
  557. ; 1. If it is everything, use `.'
  558. ; 2. If it a singleton just use the character, escaped if necessary.
  559. ; 3. If the set is -^ then use "[-^]".
  560. ; 4. If - is in the set and not inside a range, then put it last.
  561. ; 5. If ^ is " , then put it before ^^^.
  562. ; 6. If [ is " , then put it before ^^^.
  563. ; 7. If ] is " , then put it first.
  564. ;
  565. ; The code would be simpler if we didn't preserve ranges across funny
  566. ; characters.
  567. ;
  568. ; The computation is expensive enough that we cache the result. This is a
  569. ; win for the named sets (alphabetic etc.) and will also help when people use
  570. ; subexpressions multiple times.
  571. (define (set->posix-string set)
  572. (or (set-string set)
  573. (let ((string (bit-set->posix-string (set-use-case set))))
  574. (set-set-string! set string)
  575. string)))
  576. ; Pick off empty sets and full sets, then negate if necessary and build the
  577. ; string. MAYBE-BIT-SET->STRING returns a character if the set is a singleton.
  578. (define (bit-set->posix-string bit-set)
  579. (cond ((= bit-set 0)
  580. (error "trying to convert the empty set"))
  581. ((= (bitwise-and bit-set all-chars)
  582. all-chars)
  583. ".")
  584. (else
  585. (let* ((string (maybe-bit-set->string bit-set)))
  586. (if (string? string)
  587. (string-append "["
  588. (if (char? string)
  589. (list->string (list string))
  590. string)
  591. "]")
  592. (char->posix-string string))))))
  593. ; The general rule does not work for "[-^]" (it would come out as "[^-]").
  594. (define dash-hat (bitwise-ior (char->mask #\-)
  595. (char->mask #\^)))
  596. ; This is a list of three element lists:
  597. ; - a bitmask for a character that has special meaning
  598. ; - the character itself
  599. ; - a mask with the character and two or three adjacent characters
  600. ; If the character is present but not all of the bigger mask, then the character
  601. ; will not be inside a range and so must be removed and placed at the end of the
  602. ; string. This is complicated by #\] and #\^ having adjacent scalar values.
  603. ; If one is present then the four characters "\]^_" must be present to make sure
  604. ; the funny character is inside a range.
  605. ;
  606. ; The bitmask comes first so that we can use ASSOC to search the list.
  607. (define funny-char-list
  608. (map (lambda (char)
  609. (let ((mask (char->mask char)))
  610. (list mask
  611. char
  612. (bitwise-ior mask
  613. (arithmetic-shift mask -1)
  614. (arithmetic-shift mask 1)
  615. (cond ((eq? char #\^)
  616. (char->mask #\\))
  617. ((eq? char #\])
  618. (char->mask #\_))
  619. (else
  620. 0))))))
  621. '(#\[ #\^ #\- #\])))
  622. ; Pick off "-^" and any funny characters on their own. Otherwise we remove
  623. ; any funny characters that will not be internal to a range and then add them
  624. ; back in at the end. The order of the funny character data makes it so that
  625. ; any subset of them is an unambiguous end for the set (except for #\] which
  626. ; goes at the beginning).
  627. (define (maybe-bit-set->string bit-set)
  628. (cond ((= bit-set dash-hat)
  629. "-^")
  630. ((assoc bit-set funny-char-list)
  631. => cadr)
  632. (else
  633. (reduce ((list* info funny-char-list))
  634. ((bit-set bit-set)
  635. (funny-chars '()))
  636. (if (or (= 0
  637. (bitwise-and bit-set
  638. (car info)))
  639. (= (caddr info)
  640. (bitwise-and bit-set
  641. (caddr info))))
  642. (values bit-set
  643. funny-chars)
  644. (values (bitwise-xor bit-set (car info))
  645. (cons (cadr info) funny-chars)))
  646. (let ((chars (bit-set->chars bit-set)))
  647. (char-list->string (if (and (not (null? funny-chars))
  648. (eq? (car funny-chars) #\]))
  649. (append (cdr funny-chars)
  650. (reverse (cons #\] chars)))
  651. (append funny-chars
  652. (reverse chars)))))))))
  653. ; Returns a list of pairs (char0 . char1), each pair representing an inclusive
  654. ; range of characters in the bitset. We first make a list of possibly adjacent
  655. ; ranges and then merge the adjacent ones. Believe it or not, the code is
  656. ; much simpler this way.
  657. (define (bit-set->chars bit-set)
  658. (reduce ((bits* chunk bit-set 16)
  659. (count* i 0 -1 16))
  660. ((ranges '()))
  661. (if (= 0 chunk)
  662. ranges
  663. (small-bit-set->ranges chunk i ranges))
  664. (reduce ((list* range (cdr ranges)))
  665. ((done (list (car ranges))))
  666. (if (= (+ (cdr range) 1)
  667. (caar done))
  668. (cons (cons (car range) (cdar done))
  669. (cdr done))
  670. (cons range done)))))
  671. ; Here we loop through the bits creating ranges. REDUCE doesn't work because
  672. ; we need both the next bit and the remaining bit-set.
  673. (define (small-bit-set->ranges bit-set i result)
  674. (let loop ((bit-set bit-set) (i i))
  675. (cond ((= 0 bit-set)
  676. result)
  677. ((odd? bit-set)
  678. (let range-loop ((bit-set (shift-down bit-set)) (j (+ i 1)))
  679. (if (odd? bit-set)
  680. (range-loop (shift-down bit-set) (+ j 1))
  681. (small-bit-set->ranges (shift-down bit-set)
  682. (+ j 1)
  683. (cons (cons i (- j 1))
  684. result)))))
  685. (else
  686. (loop (shift-down bit-set) (+ i 1))))))
  687. (define (shift-down n)
  688. (arithmetic-shift n -1))
  689. ; Turn RANGES, which is list of pairs and
  690. ; (<scalar value start> . <scalar value end>) ranges
  691. ; into a string, where the ranges become <start char>-<end char>. Any
  692. ; characters in RANGES are put directly into the result.
  693. (define (char-list->string ranges)
  694. (if (and (null? (cdr ranges))
  695. (= (caar ranges)
  696. (cdar ranges)))
  697. (string (scalar-value->char (caar ranges)))
  698. (reduce ((list* range ranges))
  699. ((res '()))
  700. (if (char? range)
  701. (cons range res)
  702. (let ((first (scalar-value->char (car range)))
  703. (last (scalar-value->char (cdr range))))
  704. (case (- (cdr range) (car range))
  705. ((0)
  706. (cons first res))
  707. ((1)
  708. (cons first (cons last res)))
  709. (else
  710. `(,first #\- ,last . ,res)))))
  711. (list->string res))))
  712. ; These are the characters that need to be escaped when appearing in an
  713. ; expression (but not necessarily when in a character set).
  714. (define special-char-set
  715. (set-use-case (set "[.*?()|\\$^+")))
  716. ; Add a \ in front of CHAR if it is an operator.
  717. ;
  718. ; To avoid sequences like "{3", which look like parts of ranges, we always put
  719. ; #\{ characters in ranges.
  720. (define (char->posix-string char)
  721. (cond ((char=? char #\{)
  722. "[{]")
  723. ((= 0 (bitwise-and (char->mask char)
  724. special-char-set))
  725. (string char))
  726. (else
  727. (string #\\ char))))
  728. ;----------------
  729. ; Matching
  730. ;
  731. ; A compiled regexp is actually a pair of a POSIX regexp object and the list
  732. ; of submatch indexes.
  733. (define (any-match? exp string)
  734. (if (and (regexp? exp)
  735. (string? string))
  736. (if (empty-set? exp)
  737. #f
  738. (regexp-match (car (compile-exp exp)) string 0 #f #t #t))
  739. (call-error "invalid argument" any-match? exp string)))
  740. (define (exact-match? exp string)
  741. (if (and (regexp? exp)
  742. (string? string))
  743. (if (empty-set? exp)
  744. #f
  745. (let ((matches (regexp-match (car (compile-exp exp)) string 0 #t #t #t)))
  746. (and matches
  747. (= 0 (match-start (car matches)))
  748. (= (string-length string) (match-end (car matches))))))
  749. (call-error "invalid argument" exact-match? exp string)))
  750. ; Do the match and select out the match records that correspond to submatches,
  751. ; making them into an alist. The first match record, which is for the entire
  752. ; expression is remade with the alist as an additional field.
  753. (define (match exp string)
  754. (if (and (regexp? exp)
  755. (string? string))
  756. (let* ((pair (compile-exp exp))
  757. (regexp (car pair))
  758. (match-flags (cdr pair))
  759. (matches (regexp-match regexp string 0 #t #t #t)))
  760. (if matches
  761. (reduce ((list% match (cdr matches))
  762. (list% flag match-flags))
  763. ((submatches '()))
  764. (if (and flag match)
  765. (cons (cons (car flag)
  766. match)
  767. submatches)
  768. submatches)
  769. (make-match (match-start (car matches))
  770. (match-end (car matches))
  771. (reverse submatches)))
  772. #f))
  773. (call-error "invalid argument" match exp string)))
  774. ; Compile the expression if this hasn't already been done. The compiled version
  775. ; is the POSIX regexp object and the list of submatch indexes.
  776. (define (compile-exp exp)
  777. (or (regexp-compiled exp)
  778. (mvlet* (((string match-flags)
  779. (exp->posix-string exp))
  780. ;; must always have submatches on, needed by EXACT-MATCH?
  781. (regexp (make-posix-regexp string
  782. (regexp-option extended)
  783. (regexp-option submatches))))
  784. (set-regexp-compiled! exp (cons regexp match-flags))
  785. (cons regexp match-flags))))
  786. ;----------------
  787. ; A handy debugging function. In theory this returns as S-expression that
  788. ; when EVAL'ed would give back the original expression. It does not produce
  789. ; a particularly compact s-expression.
  790. (define (regexp->s-exp x)
  791. (cond ((not (regexp? x))
  792. (call-error "invalid argument" regexp->s-exp x))
  793. ((set? x)
  794. (list 'set
  795. (let ((chars (set->chars x)))
  796. (if (= 1 (length chars))
  797. (car chars)
  798. (list->string chars)))))
  799. ((submatch? x)
  800. (list 'submatch (regexp->s-exp (submatch-exp x))))
  801. ((sequence? x)
  802. (cons 'sequence (map regexp->s-exp (sequence-exps x))))
  803. ((one-of? x)
  804. (cons 'one-of (map regexp->s-exp (one-of-exps x))))
  805. ((repeat? x)
  806. (list 'repeat
  807. (repeat-low x)
  808. (repeat-high x)
  809. (regexp->s-exp (repeat-exp x))))
  810. ((string-start? x)
  811. '(string-start))
  812. ((string-end? x)
  813. '(string-end))
  814. (else
  815. (error "unknown type of regular-expression" x))))
  816. ; Used by EXP->S-EXP. Returns a list of the characters in SET (using the
  817. ; case-sensitive set).
  818. (define (set->chars set)
  819. (iterate loop ((count* i 0 -1 16))
  820. ((bits (set-use-case set))
  821. (chars '()))
  822. (if (= 0 bits)
  823. (reverse chars)
  824. (loop (arithmetic-shift bits -16)
  825. (iterate loop ((count* i i))
  826. ((bits (bitwise-and bits #xffff))
  827. (chars chars))
  828. (if (= 0 bits)
  829. chars
  830. (loop (arithmetic-shift bits -1)
  831. (if (odd? bits)
  832. (cons (scalar-value->char i)
  833. chars)
  834. chars))))))))