func-regexp.scm 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Richard Kelsey, Jonathan Rees, Mike Sperber
  3. ; All regular expressions are records of the following type.
  4. ;
  5. ; type - a symbol indicating which type of regular expression this is.
  6. ; compiled - cache for the compiled Posix regular expression; initially #F
  7. ; field0 - data fields containing
  8. ; field1 - type-specific
  9. ; field2 - information
  10. ;
  11. ; Regular expressions are not modified, except for adding cached Posix values.
  12. ;
  13. ; The different types of regular expressions are:
  14. ; Set
  15. ; use-case - character bitmask for case-sensitive matching
  16. ; no-case - character bitmask for case-insensitive matching
  17. ; string - cached Posix regular expression for this set
  18. ; String-Start
  19. ; String-End
  20. ; Submatch
  21. ; subexp - the subexpression whose match should be reported
  22. ; id - EQ? tag used to identify the substring that matched
  23. ; Sequence
  24. ; subexps - list subexpressions to be matched in order
  25. ; One-Of
  26. ; subexps - list of subexpressions, any one of which is to match
  27. ; Repeat
  28. ; low - minimum count
  29. ; high - maximum count, or #f if unbounded
  30. ; subexp - subexpression to be repeated
  31. (define-record-type regexp :regexp
  32. (make-regexp type compiled field0 field1 field2)
  33. regexp?
  34. (type regexp-type)
  35. (compiled regexp-compiled set-regexp-compiled!)
  36. (field0 regexp-field0)
  37. (field1 regexp-field1)
  38. (field2 regexp-field2 set-regexp-field2!))
  39. (define-record-discloser :regexp
  40. (lambda (regexp)
  41. (list 'regexp (regexp-type regexp))))
  42. ; A goofy macro to make the expression-type definitions more readable.
  43. (define-syntax define-regexp-type
  44. (syntax-rules ()
  45. ((define-regexp-type name maker predicate)
  46. (begin
  47. (define (maker) (make-regexp 'name #f #f #f #f))
  48. (define (predicate regexp)
  49. (eq? (regexp-type regexp) 'name))))
  50. ((define-regexp-type name maker predicate slot0)
  51. (begin
  52. (define (maker x)
  53. (make-regexp 'name #f x #f #f))
  54. (define (predicate regexp)
  55. (eq? (regexp-type regexp) 'name))
  56. (define slot0 regexp-field0)))
  57. ((define-regexp-type name maker predicate slot0 slot1)
  58. (begin
  59. (define (maker x y) (make-regexp 'name #f x y #f))
  60. (define (predicate regexp) (eq? (regexp-type regexp) 'name))
  61. (define slot0 regexp-field0)
  62. (define slot1 regexp-field1)))
  63. ((define-regexp-type name maker predicate slot0 slot1 slot2)
  64. (begin
  65. (define (maker x y z) (make-regexp 'name #f x y z))
  66. (define (predicate regexp) (eq? (regexp-type regexp) 'name))
  67. (define slot0 regexp-field0)
  68. (define slot1 regexp-field1)
  69. (define slot2 regexp-field2)))))
  70. ; Character sets
  71. ;
  72. ; Each has two bitsets, one which is the case-sensitive version of the set and
  73. ; the other the case-insensitive.
  74. ;
  75. ; There are two cached values, the POSIX regexp string for the set and the
  76. ; actual regexp object. The second is set only if this set is the entire
  77. ; expression being matched.
  78. (define-regexp-type set really-make-set set?
  79. set-use-case
  80. set-no-case
  81. set-string)
  82. (define set-set-string! set-regexp-field2!)
  83. (define (make-set case no-case)
  84. (really-make-set case no-case #f))
  85. (define the-empty-set
  86. (really-make-set 0 0 #f))
  87. (define (empty-set? set)
  88. (and (set? set)
  89. (= 0 (set-use-case set))
  90. (= 0 (set-no-case set))))
  91. (define (char->mask char)
  92. (arithmetic-shift 1 (char->scalar-value char)))
  93. (define (char-in-set? char set)
  94. (not (zero? (bitwise-and (set-use-case set)
  95. (char->mask char)))))
  96. (define char-limit 256) ; allow eight-bit characters
  97. ; A vector mapping Latin-1 values to case-insensitive bitsets.
  98. ; It's unclear how to make this work with Unicode while preserving
  99. ; the underlying 8-bit POSIX API.
  100. (define no-case-char-masks
  101. (reduce ((count* i 0 char-limit))
  102. ((masks '()))
  103. (cons (let ((ch (scalar-value->char i)))
  104. (bitwise-ior (arithmetic-shift 1 i)
  105. (cond ((and (< i 128)
  106. (char-upper-case? ch))
  107. (char->mask (char-downcase ch)))
  108. ((and (< i 128)
  109. (char-lower-case? ch))
  110. (char->mask (char-upcase ch)))
  111. (else
  112. 0))))
  113. masks)
  114. (list->vector (reverse masks))))
  115. ; A vector of the single-character sets.
  116. (define singleton-sets
  117. (reduce ((count* i 0 char-limit)) ; allow eight-bit characters
  118. ((sets '()))
  119. (cons (make-set (arithmetic-shift 1 i)
  120. (vector-ref no-case-char-masks i))
  121. sets)
  122. (list->vector (reverse sets))))
  123. (define (char->set char)
  124. (vector-ref singleton-sets (char->scalar-value char)))
  125. ; Arguments can be strings or single characters. We walk down all of the
  126. ; characters, or-ing their masks together.
  127. (define (set . all-args)
  128. (if (and (= 1 (length all-args))
  129. (char? (car all-args)))
  130. (char->set (car all-args))
  131. (reduce ((list* arg all-args))
  132. ((case 0)
  133. (no-case 0))
  134. (cond ((char? arg)
  135. (add-char-masks arg case no-case))
  136. ((string? arg)
  137. (add-string-masks arg case no-case))
  138. (else
  139. (apply assertion-violation 'set "invalid argument" all-args)))
  140. (make-set case no-case))))
  141. (define (add-char-masks char case no-case)
  142. (values (bitwise-ior case (char->mask char))
  143. (bitwise-ior no-case
  144. (vector-ref no-case-char-masks
  145. (char->scalar-value char)))))
  146. (define (add-string-masks string case no-case)
  147. (reduce ((string* char string))
  148. ((case case)
  149. (no-case no-case))
  150. (add-char-masks char case no-case)))
  151. ; Ranges. Again, we loop through the ranges building up two masks.
  152. (define (range low high)
  153. (or (real-ranges `(,low ,high) char->integer integer->scalar-value)
  154. (assertion-violation 'range "invalid argument" low high)))
  155. (define (ranges . limits)
  156. (or (real-ranges limits char->integer integer->scalar-value)
  157. (apply assertion-violation 'ranges "invalid argument" limits)))
  158. (define (ascii-range low high)
  159. (or (real-ranges `(,low ,high) char->ascii identity)
  160. (assertion-violation 'ascii-range "invalid argument" low high)))
  161. (define (ascii-ranges . limits)
  162. (or (real-ranges limits char->ascii identity)
  163. (apply assertion-violation 'ascii-ranges "invalid argument" limits)))
  164. (define (integer->scalar-value i)
  165. (char->scalar-value (integer->char i)))
  166. (define (identity i)
  167. i)
  168. ; LIMITS is a list of lists (<start-char> <end-char>), CHAR->INT returns an
  169. ; integer given a character and INT->SCALAR-VALUE translates that integer to the
  170. ; corresponding scalar value.
  171. (define (real-ranges limits char->int int->scalar-value)
  172. (if (every char? limits)
  173. (let loop ((to-do limits) (case 0) (no-case 0))
  174. (cond ((null? to-do)
  175. (make-set case no-case))
  176. ((null? (cdr to-do))
  177. #f)
  178. (else
  179. (let ((start (char->int (car to-do)))
  180. (end (char->int (cadr to-do))))
  181. (if (< end start)
  182. #f
  183. (reduce ((count* i start (+ end 1)))
  184. ((case case)
  185. (no-case no-case))
  186. (let ((scalar-value (int->scalar-value i)))
  187. (values (bitwise-ior case
  188. (arithmetic-shift 1 scalar-value))
  189. (bitwise-ior no-case
  190. (vector-ref no-case-char-masks
  191. scalar-value))))
  192. (loop (cddr to-do) case no-case)))))))
  193. #f))
  194. ; The mask with all ones.
  195. (define all-chars (- (arithmetic-shift 1 char-limit) 1))
  196. (define all-chars-except-nul (- all-chars 1))
  197. ; Set operations.
  198. (define (negate set)
  199. (make-set (bitwise-xor all-chars-except-nul (set-use-case set))
  200. (bitwise-xor all-chars-except-nul (set-no-case set))))
  201. (define (set-binop op)
  202. (lambda (set1 set2)
  203. (make-set (op (set-use-case set1)
  204. (set-use-case set2))
  205. (op (set-no-case set1)
  206. (set-no-case set2)))))
  207. (define intersection (set-binop bitwise-and))
  208. (define union (set-binop bitwise-ior))
  209. (define subtract
  210. (set-binop (lambda (x y)
  211. (bitwise-xor x
  212. (bitwise-and x y)))))
  213. ; Predefined sets. These are from the LC_CTYPE category in the POSIX
  214. ; Locale, which is the nearest POSIX gets to defining character classes.
  215. (define lower-case (range #\a #\z))
  216. (define upper-case (range #\A #\Z))
  217. (define alphabetic (union lower-case upper-case))
  218. (define numeric (range #\0 #\9))
  219. (define alphanumeric (union alphabetic numeric))
  220. (define punctuation (set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"))
  221. (define blank (set #\space (scalar-value->char 9))) ;#\tab
  222. (define graphic (union alphanumeric punctuation))
  223. (define whitespace (apply set (map scalar-value->char
  224. '(32 ;space
  225. 9 ;tab
  226. 10 ;newline
  227. 11 ;vertical tab
  228. 12 ;form feed
  229. 13)))) ;return
  230. (define printing (union graphic (set #\space)))
  231. (define control (negate printing))
  232. (define hexdigit (union numeric (ranges #\a #\f #\A #\F)))
  233. ;----------------
  234. ; String beginning and end anchors.
  235. (define-regexp-type string-start make-string-start string-start?)
  236. (define-regexp-type string-end make-string-end string-end?)
  237. (define the-string-start (make-string-start))
  238. (define the-string-end (make-string-end))
  239. (define (string-start)
  240. the-string-start)
  241. (define (string-end)
  242. the-string-end)
  243. ;----------------
  244. ; Expression types.
  245. ;
  246. ; There are four records for expressions: submatches, sequences, one-ofs,
  247. ; and repeats. The first and last have a single subexpression, the middle
  248. ; two each have a list.
  249. ;
  250. ; Each record contains a slot for caching a POSIX regexp object. This is
  251. ; set only if that record is the entire expression being matched.
  252. ; A submatch is just a marker wrapped around another expression, with a tag
  253. ; used to identify the matched substring.
  254. (define-regexp-type submatch make-submatch submatch?
  255. submatch-exp
  256. submatch-id)
  257. (define (submatch id exp)
  258. (cond ((not (regexp? exp))
  259. (assertion-violation 'submatch "invalid argument" exp))
  260. ((empty-set? exp)
  261. the-empty-set)
  262. (else
  263. (make-submatch exp id))))
  264. ; Sequences.
  265. (define-regexp-type sequence make-sequence sequence?
  266. sequence-exps)
  267. ; Epsilon is encoded as an empty sequence.
  268. (define epsilon (make-sequence '()))
  269. (define (epsilon? x)
  270. (and (sequence? x)
  271. (null? (sequence-exps x))))
  272. ; We splice out any subordinate sequences. An empty-set means that the sequence
  273. ; itself can never match and is thus the empty-set as well.
  274. ;
  275. ; We can't use REDUCE because we add on to the EXPS list.
  276. (define (sequence . all-exps)
  277. (let loop ((exps all-exps) (res '()))
  278. (if (null? exps)
  279. (cond ((null? res)
  280. epsilon)
  281. ((null? (cdr res))
  282. (car res))
  283. (else
  284. (make-sequence (reverse res))))
  285. (let ((exp (car exps)))
  286. (cond ((not (regexp? exp))
  287. (apply assertion-violation 'sequence "invalid argument" all-exps))
  288. ((empty-set? exp)
  289. the-empty-set)
  290. ((sequence? exp)
  291. (loop (append (sequence-exps exp) (cdr exps)) res))
  292. (else
  293. (loop (cdr exps) (cons exp res))))))))
  294. ; one-ofs
  295. (define-regexp-type one-of make-one-of one-of?
  296. one-of-exps)
  297. ; We splice out any subordinate one-ofs and merge all sets into a single set.
  298. ; An empty one-of is the empty set. Any subordinate empty-sets can be dropped.
  299. (define (one-of . all-exps)
  300. (let loop ((exps all-exps) (res '()) (set the-empty-set))
  301. (if (null? exps)
  302. (cond ((null? res)
  303. set)
  304. ((and (null? (cdr res))
  305. (empty-set? set))
  306. (car res))
  307. (else
  308. (make-one-of (if (empty-set? set)
  309. (reverse res)
  310. (cons set (reverse res))))))
  311. (let ((exp (car exps)))
  312. (cond ((not (regexp? exp))
  313. (apply assertion-violation 'one-of "invalid argument" all-exps))
  314. ((empty-set? exp)
  315. (loop (cdr exps) res set))
  316. ((set? exp)
  317. (loop (cdr exps) res (union exp set)))
  318. ((one-of? exp)
  319. (loop (append (one-of-exps exp) (cdr exps)) res set))
  320. (else
  321. (loop (cdr exps) (cons exp res) set)))))))
  322. ; (TEXT <string>) matches the <string>.
  323. (define (text string)
  324. (if (string? string)
  325. (apply sequence (map char->set (string->list string)))
  326. (assertion-violation 'text "invalid argument" string)))
  327. ; Repetitions
  328. ; LOW is an integer >= 0.
  329. ; HIGH is either an integer >= LOW or #F, indicating that there is no limit.
  330. ; EXP is the expression to be repeated.
  331. (define-regexp-type repeat really-make-repeat repeat?
  332. repeat-low
  333. repeat-high
  334. repeat-exp)
  335. ; Optimizations:
  336. ; - if both high and low are 0 this is the empty set
  337. ; - if both high and low are 1 this is just the expression
  338. ; - the empty set is epsilon if zero repetitions are allowed, or itself
  339. ; if at least one repetition is required.
  340. (define (make-repeat low high exp)
  341. (cond ((not (and (integer? low)
  342. (<= 0 low)
  343. (regexp? exp)
  344. (or (not high)
  345. (and (integer? high)
  346. (<= low high)))))
  347. (assertion-violation 'make-repeat "invalid argument" low high exp))
  348. ((or (epsilon? exp)
  349. (and high
  350. (= low high 0)))
  351. epsilon)
  352. ((and high
  353. (= low high 1))
  354. exp)
  355. ((empty-set? exp)
  356. (if (and (= low 0)
  357. (or (not high)
  358. (= high 0)))
  359. epsilon
  360. the-empty-set))
  361. (else
  362. (really-make-repeat low high exp))))
  363. ; One argument is an expression that can appear any number of times.
  364. ; Two arguments are LOW and an expression.
  365. ; Three arguments are LOW, HIGH, and an expression.
  366. (define (repeat . stuff)
  367. (case (length stuff)
  368. ((0)
  369. (assertion-violation 'repeat "invalid argument" repeat))
  370. ((1)
  371. (make-repeat 0 #f (car stuff)))
  372. ((2)
  373. (make-repeat (car stuff) (car stuff) (cadr stuff)))
  374. ((3)
  375. (apply make-repeat stuff))
  376. (else
  377. (apply assertion-violation 'repeat "invalid argument" stuff))))
  378. ;----------------
  379. ; Three functions that transform EXP instead of having their own record type.
  380. ; They are idempotent: (eq? (f x) (f (f x))) => #t.
  381. (define (ignore-case exp)
  382. (start-expression-map ignore-case 'no-case exp))
  383. (define (use-case exp)
  384. (start-expression-map use-case 'use-case exp))
  385. (define (no-submatches exp)
  386. (start-expression-map no-submatches 'no-submatch exp))
  387. ; FUNCTION is one of:
  388. ; no-case - replace use-case with no-case in all sets
  389. ; use-case - replace no-case with use-case in all sets
  390. ; no-submatch - remove all submatches
  391. ; This reuses as much of EXP as possible.
  392. (define (start-expression-map proc function exp)
  393. (if (regexp? exp)
  394. (or (expression-map function exp)
  395. exp)
  396. (assertion-violation 'start-expression-map "invalid argument" proc exp)))
  397. ; This returns #F if FUNCTION does not modify EXP.
  398. (define (expression-map function exp)
  399. (let recur ((exp exp))
  400. (cond ((set? exp)
  401. (if (or (eq? function 'no-submatch)
  402. (= (set-use-case exp)
  403. (set-no-case exp)))
  404. #f
  405. (let ((chars (if (eq? function 'no-case)
  406. (set-no-case exp)
  407. (set-use-case exp))))
  408. (make-set chars chars))))
  409. ((submatch? exp)
  410. (let ((sub (submatch-exp exp)))
  411. (if (eq? function 'no-submatch)
  412. (or (recur sub)
  413. sub)
  414. (let ((new (recur sub)))
  415. (if new
  416. (submatch (submatch-id exp) new)
  417. #f)))))
  418. ((sequence? exp)
  419. (let ((exps (expression-list-map function (sequence-exps exp))))
  420. (if exps
  421. (make-sequence exps)
  422. #f)))
  423. ((one-of? exp)
  424. (let ((exps (expression-list-map function (one-of-exps exp))))
  425. (if exps
  426. (make-one-of exps)
  427. #f)))
  428. ((repeat? exp)
  429. (let ((new (recur (repeat-exp exp))))
  430. (if new
  431. (make-repeat (repeat-low exp)
  432. (repeat-high exp)
  433. new)
  434. #f)))
  435. ((or (string-start? exp)
  436. (string-end? exp))
  437. exp)
  438. (else
  439. (assertion-violation 'expression-map "got a non-expression" exp)))))
  440. ; As above, this returns #F if FUNCTION leaves EXPS unchanged.
  441. (define (expression-list-map function exps)
  442. (let recur ((exps exps))
  443. (if (null? exps)
  444. #f
  445. (let ((new (expression-map function (car exps)))
  446. (more (recur (cdr exps))))
  447. (if (or new more)
  448. (cons (or new (car exps))
  449. (or more (cdr exps)))
  450. #f)))))
  451. ;----------------
  452. ; Translation to a POSIX regexp string.
  453. ;
  454. ; We get the string from EXP->STRINGS as a list of substrings to concatenate
  455. ; together. In this list (<id>) represents a #\( that begins a submatch with
  456. ; the given id and #F represents a #\( that does not begin a submatch. We walk
  457. ; down the list replacing these with #\( and collecting the ids and #f's into a
  458. ; list. When matching this list will be used to discard the match records that
  459. ; correspond to parens used for grouping while retaining those that correspond
  460. ; to submatches.
  461. ;
  462. ; This used to signal an error if a beginning-of-line or end-of-line occured
  463. ; in an unmatchable position. This is a questionable notion and the checking
  464. ; was not completely accurate, so I removed it.
  465. (define (exp->posix-string exp)
  466. (cond ((not (regexp? exp))
  467. (assertion-violation 'exp->posix-string "invalid argument" exp))
  468. ((empty-set? exp)
  469. (assertion-violation 'exp->posix-string "no Posix string for the empty set" exp))
  470. (else
  471. (reduce ((list* elt (exp->strings exp '())))
  472. ((strings '())
  473. (parens '()))
  474. (cond ((not elt)
  475. (values (cons "(" strings) (cons #f parens)))
  476. ((pair? elt)
  477. (values (cons "(" strings) (cons elt parens)))
  478. (else
  479. (values (cons elt strings) parens)))
  480. (values (apply string-append (reverse strings))
  481. (reverse parens))))))
  482. ; submatch -> (...), no need to parenthesize the subexp
  483. ; one-of -> ... | ... | ..., no need to parenthesize the subexps
  484. ; seq -> concatenation, need to parenthesize one-ofs
  485. ; repeat -> ...{x,y}, parenthesize sequences and one-ofs
  486. ;
  487. ; TAIL is a list of strings to come after those for EXP.
  488. (define (exp->strings exp tail)
  489. (cond ((set? exp)
  490. (cons (set->posix-string exp) tail))
  491. ((submatch? exp)
  492. `((,(submatch-id exp))
  493. ,@(exp->strings (submatch-exp exp) '())
  494. ")"
  495. . ,tail))
  496. ((one-of? exp)
  497. (one-of-strings (one-of-exps exp) tail))
  498. ((sequence? exp)
  499. (sequence-strings (sequence-exps exp) tail))
  500. ((repeat? exp)
  501. (repetition-strings (repeat-low exp)
  502. (repeat-high exp)
  503. (repeat-exp exp)
  504. tail))
  505. ((string-start? exp)
  506. (cons "^" tail))
  507. ((string-end? exp)
  508. (cons "$" tail))
  509. (else
  510. (assertion-violation 'exp->strings "bad expression" exp))))
  511. ; Add parentheses around the strings for EXP, encoding "(" as #F because it
  512. ; does not begin a submatch.
  513. (define (exp->parenthesized-strings exp tail)
  514. (cons #f (exp->strings exp (cons ")" tail))))
  515. ; Convert EXPS to strings, adding "|" between them. This depends on there
  516. ; being at least two EXPS.
  517. (define (one-of-strings exps tail)
  518. (let ((exps (reverse exps)))
  519. (reduce ((list* exp (cdr exps)))
  520. ((res (exp->strings (car exps) tail)))
  521. (exp->strings exp (cons "|" res)))))
  522. ; Convert EXPS to strings, adding parentheses for any that are ONE-OFs.
  523. (define (sequence-strings exps tail)
  524. (if (null? exps)
  525. `(#f ")" . ,tail)
  526. (reduce ((list* exp (reverse exps)))
  527. ((strings tail))
  528. (if (one-of? exp)
  529. (exp->parenthesized-strings exp strings)
  530. (exp->strings exp strings)))))
  531. ; e{x,} X or more Es
  532. ; e{x} exactly X Es
  533. ; e{x,y} between X and Y Es, inclusive
  534. (define (repetition-strings low high subexp tail)
  535. (let ((tail `("{"
  536. ,(number->string low)
  537. ,@(cond ((not high)
  538. (list ","))
  539. ((= low high)
  540. '())
  541. (else
  542. (list "," (number->string high))))
  543. "}"
  544. . ,tail)))
  545. (if (or (one-of? subexp)
  546. (sequence? subexp))
  547. (exp->parenthesized-strings subexp tail)
  548. (exp->strings subexp tail))))
  549. ;----------------
  550. ; Converting a set into POSIX regexp [...] sets. We have to avoid various
  551. ; pitfalls of the notation, which does not use a simple escape mechanism.
  552. ; The operator characters lose their meaning when in particular positions:
  553. ; ^ when first indicates negation, anywhere else it's normal
  554. ; ] is normal when first or second after ^
  555. ; - is normal as the first or last character
  556. ; [ is normal except when followed by any one of three characters: .=:
  557. ;
  558. ; 1. If it is everything, use `.'
  559. ; 2. If it a singleton just use the character, escaped if necessary.
  560. ; 3. If the set is -^ then use "[-^]".
  561. ; 4. If - is in the set and not inside a range, then put it last.
  562. ; 5. If ^ is " , then put it before ^^^.
  563. ; 6. If [ is " , then put it before ^^^.
  564. ; 7. If ] is " , then put it first.
  565. ;
  566. ; The code would be simpler if we didn't preserve ranges across funny
  567. ; characters.
  568. ;
  569. ; The computation is expensive enough that we cache the result. This is a
  570. ; win for the named sets (alphabetic etc.) and will also help when people use
  571. ; subexpressions multiple times.
  572. (define (set->posix-string set)
  573. (or (set-string set)
  574. (let ((string (bit-set->posix-string (set-use-case set))))
  575. (set-set-string! set string)
  576. string)))
  577. ; Pick off empty sets and full sets, then negate if necessary and build the
  578. ; string. MAYBE-BIT-SET->STRING returns a character if the set is a singleton.
  579. (define (bit-set->posix-string bit-set)
  580. (cond ((= bit-set 0)
  581. (assertion-violation 'bit-set->posix-string "trying to convert the empty set"))
  582. ((= (bitwise-and bit-set all-chars)
  583. all-chars)
  584. ".")
  585. (else
  586. (let* ((string (maybe-bit-set->string bit-set)))
  587. (if (string? string)
  588. (string-append "["
  589. (if (char? string)
  590. (list->string (list string))
  591. string)
  592. "]")
  593. (char->posix-string string))))))
  594. ; The general rule does not work for "[-^]" (it would come out as "[^-]").
  595. (define dash-hat (bitwise-ior (char->mask #\-)
  596. (char->mask #\^)))
  597. ; This is a list of three element lists:
  598. ; - a bitmask for a character that has special meaning
  599. ; - the character itself
  600. ; - a mask with the character and two or three adjacent characters
  601. ; If the character is present but not all of the bigger mask, then the character
  602. ; will not be inside a range and so must be removed and placed at the end of the
  603. ; string. This is complicated by #\] and #\^ having adjacent scalar values.
  604. ; If one is present then the four characters "\]^_" must be present to make sure
  605. ; the funny character is inside a range.
  606. ;
  607. ; The bitmask comes first so that we can use ASSOC to search the list.
  608. (define funny-char-list
  609. (map (lambda (char)
  610. (let ((mask (char->mask char)))
  611. (list mask
  612. char
  613. (bitwise-ior mask
  614. (arithmetic-shift mask -1)
  615. (arithmetic-shift mask 1)
  616. (cond ((eq? char #\^)
  617. (char->mask #\\))
  618. ((eq? char #\])
  619. (char->mask #\_))
  620. (else
  621. 0))))))
  622. '(#\[ #\^ #\- #\])))
  623. ; Pick off "-^" and any funny characters on their own. Otherwise we remove
  624. ; any funny characters that will not be internal to a range and then add them
  625. ; back in at the end. The order of the funny character data makes it so that
  626. ; any subset of them is an unambiguous end for the set (except for #\] which
  627. ; goes at the beginning).
  628. (define (maybe-bit-set->string bit-set)
  629. (cond ((= bit-set dash-hat)
  630. "-^")
  631. ((assoc bit-set funny-char-list)
  632. => cadr)
  633. (else
  634. (reduce ((list* info funny-char-list))
  635. ((bit-set bit-set)
  636. (funny-chars '()))
  637. (if (or (= 0
  638. (bitwise-and bit-set
  639. (car info)))
  640. (= (caddr info)
  641. (bitwise-and bit-set
  642. (caddr info))))
  643. (values bit-set
  644. funny-chars)
  645. (values (bitwise-xor bit-set (car info))
  646. (cons (cadr info) funny-chars)))
  647. (let ((chars (bit-set->chars bit-set)))
  648. (char-list->string (if (and (not (null? funny-chars))
  649. (eq? (car funny-chars) #\]))
  650. (append (cdr funny-chars)
  651. (reverse (cons #\] chars)))
  652. (append funny-chars
  653. (reverse chars)))))))))
  654. ; Returns a list of pairs (char0 . char1), each pair representing an inclusive
  655. ; range of characters in the bitset. We first make a list of possibly adjacent
  656. ; ranges and then merge the adjacent ones. Believe it or not, the code is
  657. ; much simpler this way.
  658. (define (bit-set->chars bit-set)
  659. (reduce ((bits* chunk bit-set 16)
  660. (count* i 0 -1 16))
  661. ((ranges '()))
  662. (if (= 0 chunk)
  663. ranges
  664. (small-bit-set->ranges chunk i ranges))
  665. (reduce ((list* range (cdr ranges)))
  666. ((done (list (car ranges))))
  667. (if (= (+ (cdr range) 1)
  668. (caar done))
  669. (cons (cons (car range) (cdar done))
  670. (cdr done))
  671. (cons range done)))))
  672. ; Here we loop through the bits creating ranges. REDUCE doesn't work because
  673. ; we need both the next bit and the remaining bit-set.
  674. (define (small-bit-set->ranges bit-set i result)
  675. (let loop ((bit-set bit-set) (i i))
  676. (cond ((= 0 bit-set)
  677. result)
  678. ((odd? bit-set)
  679. (let range-loop ((bit-set (shift-down bit-set)) (j (+ i 1)))
  680. (if (odd? bit-set)
  681. (range-loop (shift-down bit-set) (+ j 1))
  682. (small-bit-set->ranges (shift-down bit-set)
  683. (+ j 1)
  684. (cons (cons i (- j 1))
  685. result)))))
  686. (else
  687. (loop (shift-down bit-set) (+ i 1))))))
  688. (define (shift-down n)
  689. (arithmetic-shift n -1))
  690. ; Turn RANGES, which is list of pairs and
  691. ; (<scalar value start> . <scalar value end>) ranges
  692. ; into a string, where the ranges become <start char>-<end char>. Any
  693. ; characters in RANGES are put directly into the result.
  694. (define (char-list->string ranges)
  695. (if (and (null? (cdr ranges))
  696. (= (caar ranges)
  697. (cdar ranges)))
  698. (string (scalar-value->char (caar ranges)))
  699. (reduce ((list* range ranges))
  700. ((res '()))
  701. (if (char? range)
  702. (cons range res)
  703. (let ((first (scalar-value->char (car range)))
  704. (last (scalar-value->char (cdr range))))
  705. (case (- (cdr range) (car range))
  706. ((0)
  707. (cons first res))
  708. ((1)
  709. (cons first (cons last res)))
  710. (else
  711. `(,first #\- ,last . ,res)))))
  712. (list->string res))))
  713. ; These are the characters that need to be escaped when appearing in an
  714. ; expression (but not necessarily when in a character set).
  715. (define special-char-set
  716. (set-use-case (set "[.*?()|\\$^+")))
  717. ; Add a \ in front of CHAR if it is an operator.
  718. ;
  719. ; To avoid sequences like "{3", which look like parts of ranges, we always put
  720. ; #\{ characters in ranges.
  721. (define (char->posix-string char)
  722. (cond ((char=? char #\{)
  723. "[{]")
  724. ((= 0 (bitwise-and (char->mask char)
  725. special-char-set))
  726. (string char))
  727. (else
  728. (string #\\ char))))
  729. ;----------------
  730. ; Matching
  731. ;
  732. ; A compiled regexp is actually a pair of a POSIX regexp object and the list
  733. ; of submatch indexes.
  734. (define (any-match? exp string)
  735. (if (and (regexp? exp)
  736. (string? string))
  737. (if (empty-set? exp)
  738. #f
  739. (regexp-match (car (compile-exp exp)) string 0 #f #t #t))
  740. (assertion-violation 'any-match? "invalid argument" exp string)))
  741. (define (exact-match? exp string)
  742. (if (and (regexp? exp)
  743. (string? string))
  744. (if (empty-set? exp)
  745. #f
  746. (let ((matches (regexp-match (car (compile-exp exp)) string 0 #t #t #t)))
  747. (and matches
  748. (= 0 (match-start (car matches)))
  749. (= (string-length string) (match-end (car matches))))))
  750. (assertion-violation 'exact-match? "invalid argument" exp string)))
  751. ; Do the match and select out the match records that correspond to submatches,
  752. ; making them into an alist. The first match record, which is for the entire
  753. ; expression is remade with the alist as an additional field.
  754. (define (match exp string)
  755. (if (and (regexp? exp)
  756. (string? string))
  757. (let* ((pair (compile-exp exp))
  758. (regexp (car pair))
  759. (match-flags (cdr pair))
  760. (matches (regexp-match regexp string 0 #t #t #t)))
  761. (if matches
  762. (reduce ((list% match (cdr matches))
  763. (list% flag match-flags))
  764. ((submatches '()))
  765. (if (and flag match)
  766. (cons (cons (car flag)
  767. match)
  768. submatches)
  769. submatches)
  770. (make-match (match-start (car matches))
  771. (match-end (car matches))
  772. (reverse submatches)))
  773. #f))
  774. (assertion-violation 'match "invalid argument" exp string)))
  775. ; Compile the expression if this hasn't already been done. The compiled version
  776. ; is the POSIX regexp object and the list of submatch indexes.
  777. (define (compile-exp exp)
  778. (or (regexp-compiled exp)
  779. (mvlet* (((string match-flags)
  780. (exp->posix-string exp))
  781. ;; must always have submatches on, needed by EXACT-MATCH?
  782. (regexp (make-posix-regexp string
  783. (regexp-option extended)
  784. (regexp-option submatches))))
  785. (set-regexp-compiled! exp (cons regexp match-flags))
  786. (cons regexp match-flags))))
  787. ;----------------
  788. ; A handy debugging function. In theory this returns as S-expression that
  789. ; when EVAL'ed would give back the original expression. It does not produce
  790. ; a particularly compact s-expression.
  791. (define (regexp->s-exp x)
  792. (cond ((not (regexp? x))
  793. (assertion-violation 'regexp->s-exp "invalid argument" x))
  794. ((set? x)
  795. (list 'set
  796. (let ((chars (set->chars x)))
  797. (if (= 1 (length chars))
  798. (car chars)
  799. (list->string chars)))))
  800. ((submatch? x)
  801. (list 'submatch (regexp->s-exp (submatch-exp x))))
  802. ((sequence? x)
  803. (cons 'sequence (map regexp->s-exp (sequence-exps x))))
  804. ((one-of? x)
  805. (cons 'one-of (map regexp->s-exp (one-of-exps x))))
  806. ((repeat? x)
  807. (list 'repeat
  808. (repeat-low x)
  809. (repeat-high x)
  810. (regexp->s-exp (repeat-exp x))))
  811. ((string-start? x)
  812. '(string-start))
  813. ((string-end? x)
  814. '(string-end))
  815. (else
  816. (assertion-violation 'regexp->s-exp "unknown type of regular-expression" x))))
  817. ; Used by EXP->S-EXP. Returns a list of the characters in SET (using the
  818. ; case-sensitive set).
  819. (define (set->chars set)
  820. (iterate loop ((count* i 0 -1 16))
  821. ((bits (set-use-case set))
  822. (chars '()))
  823. (if (= 0 bits)
  824. (reverse chars)
  825. (loop (arithmetic-shift bits -16)
  826. (iterate loop ((count* i i))
  827. ((bits (bitwise-and bits #xffff))
  828. (chars chars))
  829. (if (= 0 bits)
  830. chars
  831. (loop (arithmetic-shift bits -1)
  832. (if (odd? bits)
  833. (cons (scalar-value->char i)
  834. chars)
  835. chars))))))))