srfi-14.scm 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830
  1. ; Copyright (c) 1993-2007 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; Copyright (c) 2005-2006 by Basis Technology Corporation. See file COPYING.
  3. ; This is basically a complete re-implementation, suitable for Unicode.
  4. ; Some bits and pieces from Olin's reference implementation remain,
  5. ; but none from the MIT Scheme code. For whatever remains, the
  6. ; following copyright holds:
  7. ; Copyright (c) 1994-2003 by Olin Shivers
  8. ;
  9. ; All rights reserved.
  10. ;
  11. ; Redistribution and use in source and binary forms, with or without
  12. ; modification, are permitted provided that the following conditions
  13. ; are met:
  14. ; 1. Redistributions of source code must retain the above copyright
  15. ; notice, this list of conditions and the following disclaimer.
  16. ; 2. Redistributions in binary form must reproduce the above copyright
  17. ; notice, this list of conditions and the following disclaimer in the
  18. ; documentation and/or other materials provided with the distribution.
  19. ; 3. The name of the authors may not be used to endorse or promote products
  20. ; derived from this software without specific prior written permission.
  21. ;
  22. ; THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  23. ; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  24. ; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  25. ; IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26. ; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  27. ; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28. ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29. ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30. ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  31. ; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32. (define-record-type :char-set
  33. (make-char-set simple i-list)
  34. char-set?
  35. ;; byte vector for the Latin-1 part
  36. (simple char-set-simple
  37. set-char-set-simple!)
  38. ;; inversion list for the rest
  39. (i-list char-set-i-list
  40. set-char-set-i-list!))
  41. (define (make-char-set-immutable! char-set)
  42. (make-immutable! char-set)
  43. (make-immutable! (char-set-simple char-set)))
  44. ; inversion lists are always immutable
  45. ;;; "Simple Csets"---we use mutable byte vectors for the Latin-1 part
  46. (define *simple-cset-boundary* 256)
  47. (define (simple-char? c)
  48. (< (char->scalar-value c) *simple-cset-boundary*))
  49. (define (make-empty-simple-cset)
  50. (make-byte-vector *simple-cset-boundary* 0))
  51. (define (make-full-simple-cset)
  52. (make-byte-vector *simple-cset-boundary* 1))
  53. (define (copy-simple-cset s)
  54. (byte-vector-copy s))
  55. ; don't mistake these for abstractions
  56. (define (simple-cset-code-not-member? s i) (zero? (byte-vector-ref s i)))
  57. (define (simple-cset-code-member? s i) (not (simple-cset-code-not-member? s i)))
  58. (define (simple-cset-ref s i) (byte-vector-ref s i))
  59. (define (simple-cset-set! s i v) (byte-vector-set! s i v))
  60. (define (simple-cset-remove-code! s i) (byte-vector-set! s i 0))
  61. (define (simple-cset-adjoin-code! s i) (byte-vector-set! s i 1))
  62. (define (simple-cset-contains? s char)
  63. (simple-cset-code-member? s (char->scalar-value char)))
  64. (define (simple-cset=? s1 s2)
  65. (byte-vector=? s1 s2))
  66. (define (simple-cset<=? s1 s2)
  67. (or (eq? s1 s2)
  68. (let loop ((i 0))
  69. (if (>= i *simple-cset-boundary*)
  70. #t
  71. (and (<= (simple-cset-ref s1 i) (simple-cset-ref s2 i))
  72. (loop (+ 1 i)))))))
  73. (define (simple-cset-size s)
  74. (let loop ((i 0) (size 0))
  75. (if (>= i *simple-cset-boundary*)
  76. size
  77. (loop (+ 1 i) (+ size (simple-cset-ref s i))))))
  78. (define (simple-cset-count pred s)
  79. (let loop ((i 0) (count 0))
  80. (if (>= i *simple-cset-boundary*)
  81. count
  82. (loop (+ 1 i)
  83. (if (and (simple-cset-code-member? s i) (pred (scalar-value->char i)))
  84. (+ count 1)
  85. count)))))
  86. (define (simple-cset-modify! set s chars)
  87. (for-each (lambda (c) (set s (char->scalar-value c)))
  88. chars)
  89. s)
  90. (define (simple-cset-modify set s chars)
  91. (simple-cset-modify! set (copy-simple-cset s) chars))
  92. (define (simple-cset-adjoin s . chars)
  93. (simple-cset-modify simple-cset-adjoin-code! s chars))
  94. (define (simple-cset-adjoin! s . chars)
  95. (simple-cset-modify! simple-cset-adjoin-code! s chars))
  96. (define (simple-cset-delete s . chars)
  97. (simple-cset-modify simple-cset-remove-code! s chars))
  98. (define (simple-cset-delete! s . chars)
  99. (simple-cset-modify! simple-cset-remove-code! s chars))
  100. ;;; If we represented char sets as a bit set, we could do the following
  101. ;;; trick to pick the lowest bit out of the set:
  102. ;;; (count-bits (xor (- cset 1) cset))
  103. ;;; (But first mask out the bits already scanned by the cursor first.)
  104. (define (simple-cset-cursor-next s cursor)
  105. (let loop ((cur cursor))
  106. (let ((cur (- cur 1)))
  107. (if (or (< cur 0) (simple-cset-code-member? s cur))
  108. cur
  109. (loop cur)))))
  110. (define (end-of-simple-cset? cursor)
  111. (negative? cursor))
  112. (define (simple-cset-cursor-ref cursor)
  113. (scalar-value->char cursor))
  114. (define (simple-cset-for-each proc s)
  115. (let loop ((i 0))
  116. (if (< i *simple-cset-boundary*)
  117. (begin
  118. (if (simple-cset-code-member? s i)
  119. (proc (scalar-value->char i)))
  120. (loop (+ 1 i))))))
  121. (define (simple-cset-fold kons knil s)
  122. (let loop ((i 0) (ans knil))
  123. (if (>= i *simple-cset-boundary*)
  124. ans
  125. (loop (+ 1 i)
  126. (if (simple-cset-code-not-member? s i)
  127. ans
  128. (kons (scalar-value->char i) ans))))))
  129. (define (simple-cset-every? pred s)
  130. (let loop ((i 0))
  131. (cond
  132. ((>= i *simple-cset-boundary*)
  133. #t)
  134. ((or (simple-cset-code-not-member? s i)
  135. (pred (scalar-value->char i)))
  136. (loop (+ 1 i)))
  137. (else
  138. #f))))
  139. (define (simple-cset-any pred s)
  140. (let loop ((i 0))
  141. (cond
  142. ((>= i *simple-cset-boundary*) #f)
  143. ((and (simple-cset-code-member? s i)
  144. (pred (scalar-value->char i))))
  145. (else
  146. (loop (+ 1 i))))))
  147. (define (ucs-range->simple-cset lower upper)
  148. (let ((s (make-empty-simple-cset)))
  149. (let loop ((i lower))
  150. (if (< i upper)
  151. (begin
  152. (simple-cset-adjoin-code! s i)
  153. (loop (+ 1 i)))))
  154. s))
  155. ; Algebra
  156. ; These do various "s[i] := s[i] op val" operations
  157. (define (simple-cset-invert-code! s i v)
  158. (simple-cset-set! s i (- 1 v)))
  159. (define (simple-cset-and-code! s i v)
  160. (if (zero? v)
  161. (simple-cset-remove-code! s i)))
  162. (define (simple-cset-or-code! s i v)
  163. (if (not (zero? v))
  164. (simple-cset-adjoin-code! s i)))
  165. (define (simple-cset-minus-code! s i v)
  166. (if (not (zero? v))
  167. (simple-cset-remove-code! s i)))
  168. (define (simple-cset-xor-code! s i v)
  169. (if (not (zero? v))
  170. (simple-cset-set! s i (- 1 (simple-cset-ref s i)))))
  171. (define (simple-cset-complement s)
  172. (simple-cset-complement! (copy-simple-cset s)))
  173. (define (simple-cset-complement! s)
  174. (byte-vector-iter (lambda (i v) (simple-cset-invert-code! s i v)) s)
  175. s)
  176. (define (simple-cset-op! s simple-csets code-op!)
  177. (for-each (lambda (s2)
  178. (let loop ((i 0))
  179. (if (< i *simple-cset-boundary*)
  180. (begin
  181. (code-op! s i (simple-cset-ref s2 i))
  182. (loop (+ 1 i))))))
  183. simple-csets)
  184. s)
  185. (define (simple-cset-union! s1 . ss)
  186. (simple-cset-op! s1 ss simple-cset-or-code!))
  187. (define (simple-cset-union . ss)
  188. (if (pair? ss)
  189. (apply simple-cset-union!
  190. (byte-vector-copy (car ss))
  191. (cdr ss))
  192. (make-empty-simple-cset)))
  193. (define (simple-cset-intersection! s1 . ss)
  194. (simple-cset-op! s1 ss simple-cset-and-code!))
  195. (define (simple-cset-intersection . ss)
  196. (if (pair? ss)
  197. (apply simple-cset-intersection!
  198. (byte-vector-copy (car ss))
  199. (cdr ss))
  200. (make-full-simple-cset)))
  201. (define (simple-cset-difference! s1 . ss)
  202. (simple-cset-op! s1 ss simple-cset-minus-code!))
  203. (define (simple-cset-difference s1 . ss)
  204. (if (pair? ss)
  205. (apply simple-cset-difference! (copy-simple-cset s1) ss)
  206. (copy-simple-cset s1)))
  207. (define (simple-cset-xor! s1 . ss)
  208. (simple-cset-op! s1 ss simple-cset-xor-code!))
  209. (define (simple-cset-xor . ss)
  210. (if (pair? ss)
  211. (apply simple-cset-xor!
  212. (byte-vector-copy (car ss))
  213. (cdr ss))
  214. (make-empty-simple-cset)))
  215. (define (simple-cset-diff+intersection! s1 s2 . ss)
  216. (byte-vector-iter (lambda (i v)
  217. (cond
  218. ((zero? v)
  219. (simple-cset-remove-code! s2 i))
  220. ((simple-cset-code-member? s2 i)
  221. (simple-cset-remove-code! s1 i))))
  222. s1)
  223. (for-each (lambda (s)
  224. (byte-vector-iter (lambda (i v)
  225. (if (and (not (zero? v))
  226. (simple-cset-code-member? s1 i))
  227. (begin
  228. (simple-cset-remove-code! s1 i)
  229. (simple-cset-adjoin-code! s2 i))))
  230. s))
  231. ss)
  232. (values s1 s2))
  233. ; Compute (c + 37 c + 37^2 c + ...) modulo BOUND, with sleaze thrown
  234. ; in to keep the intermediate values small. (We do the calculation
  235. ; with just enough bits to represent BOUND, masking off high bits at
  236. ; each step in calculation. If this screws up any important properties
  237. ; of the hash function I'd like to hear about it. -Olin)
  238. (define (simple-cset-hash s bound)
  239. ;; The mask that will cover BOUND-1:
  240. (let ((mask (let loop ((i #x10000)) ; Let's skip first 16 iterations, eh?
  241. (if (>= i bound) (- i 1) (loop (+ i i))))))
  242. (let loop ((i (- *simple-cset-boundary* 1)) (ans 0))
  243. (if (< i 0)
  244. (modulo ans bound)
  245. (loop (- i 1)
  246. (if (simple-cset-code-not-member? s i)
  247. ans
  248. (bitwise-and mask (+ (* 37 ans) i))))))))
  249. ;;; Now for the real character sets
  250. (define (make-empty-char-set)
  251. (make-char-set (make-empty-simple-cset)
  252. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
  253. (define (make-full-char-set)
  254. (make-char-set (make-full-simple-cset)
  255. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  256. *simple-cset-boundary* (+ 1 #x10ffff))))
  257. (define (char-set-copy cs)
  258. (make-char-set (copy-simple-cset (char-set-simple cs))
  259. (inversion-list-copy (char-set-i-list cs))))
  260. ; n-ary version
  261. (define (char-set= . rest)
  262. (or (null? rest)
  263. (let ((cs1 (car rest))
  264. (rest (cdr rest)))
  265. (let loop ((rest rest))
  266. (or (not (pair? rest))
  267. (and (char-set=/2 cs1 (car rest))
  268. (loop (cdr rest))))))))
  269. ; binary version
  270. (define (char-set=/2 cs-1 cs-2)
  271. (and (simple-cset=? (char-set-simple cs-1) (char-set-simple cs-2))
  272. (inversion-list=? (char-set-i-list cs-1)
  273. (char-set-i-list cs-2))))
  274. ; n-ary
  275. (define (char-set<= . rest)
  276. (or (null? rest)
  277. (let ((cs1 (car rest))
  278. (rest (cdr rest)))
  279. (let loop ((cs1 cs1) (rest rest))
  280. (or (not (pair? rest))
  281. (and (char-set<=/2 cs1 (car rest))
  282. (loop (car rest) (cdr rest))))))))
  283. ; binary
  284. (define (char-set<=/2 cs-1 cs-2)
  285. (and (simple-cset<=? (char-set-simple cs-1) (char-set-simple cs-2))
  286. (inversion-list<=? (char-set-i-list cs-1)
  287. (char-set-i-list cs-2))))
  288. (define (inversion-list<=? i-list-1 i-list-2)
  289. (inversion-list=? i-list-1
  290. (inversion-list-intersection i-list-1 i-list-2)))
  291. ;;; Hash
  292. ; We follow Olin's reference implementation:
  293. ;
  294. ; If you keep BOUND small enough, the intermediate calculations will
  295. ; always be fixnums. How small is dependent on the underlying Scheme system;
  296. ; we use a default BOUND of 2^22 = 4194304, which should hack it in
  297. ; Schemes that give you at least 29 signed bits for fixnums. The core
  298. ; calculation that you don't want to overflow is, worst case,
  299. ; (+ 65535 (* 37 (- bound 1)))
  300. ; where 65535 is the max character code. Choose the default BOUND to be the
  301. ; biggest power of two that won't cause this expression to fixnum overflow,
  302. ; and everything will be copacetic.
  303. (define char-set-hash
  304. (opt-lambda (cs (bound 4194304))
  305. (if (not (and (integer? bound)
  306. (exact? bound)
  307. (<= 0 bound)))
  308. (call-error "invalid bound" char-set-hash bound))
  309. (let ((bound (if (zero? bound) 4194304 bound)))
  310. (modulo (+ (simple-cset-hash (char-set-simple cs) bound)
  311. (* 37 (inversion-list-hash (char-set-i-list cs) bound)))
  312. bound))))
  313. (define (char-set-contains? cs char)
  314. (if (simple-char? char)
  315. (simple-cset-contains? (char-set-simple cs) char)
  316. (inversion-list-member? (char->scalar-value char)
  317. (char-set-i-list cs))))
  318. (define (char-set-size cs)
  319. (+ (simple-cset-size (char-set-simple cs))
  320. (inversion-list-size (char-set-i-list cs))))
  321. (define (char-set-count pred cset)
  322. (+ (simple-cset-count pred (char-set-simple cset))
  323. (inversion-list-count pred (char-set-i-list cset))))
  324. (define (inversion-list-count pred i-list)
  325. (inversion-list-fold/done? (lambda (v count)
  326. (if (pred (scalar-value->char v))
  327. (+ 1 count)
  328. count))
  329. 0
  330. (lambda (v) #f)
  331. i-list))
  332. (define (make-char-set-char-op simple-cset-op inversion-list-op)
  333. (lambda (cs . chars)
  334. (call-with-values
  335. (lambda () (partition-list simple-char? chars))
  336. (lambda (simple-chars non-simple-chars)
  337. (make-char-set (apply simple-cset-op (char-set-simple cs) simple-chars)
  338. (apply inversion-list-op (char-set-i-list cs)
  339. (map char->scalar-value non-simple-chars)))))))
  340. (define (make-char-set-char-op! simple-cset-op! simple-cset-op
  341. inversion-list-op)
  342. (lambda (cs . chars)
  343. (call-with-values
  344. (lambda () (partition-list simple-char? chars))
  345. (lambda (simple-chars non-simple-chars)
  346. (if (null? non-simple-chars)
  347. (apply simple-cset-op! (char-set-simple cs) simple-chars)
  348. (begin
  349. (set-char-set-simple! cs
  350. (apply simple-cset-op (char-set-simple cs)
  351. simple-chars))
  352. (set-char-set-i-list! cs
  353. (apply inversion-list-op (char-set-i-list cs)
  354. (map char->scalar-value non-simple-chars)))))))
  355. cs))
  356. (define char-set-adjoin
  357. (make-char-set-char-op simple-cset-adjoin inversion-list-adjoin))
  358. (define char-set-adjoin!
  359. (make-char-set-char-op! simple-cset-adjoin! simple-cset-adjoin
  360. inversion-list-adjoin))
  361. (define char-set-delete
  362. (make-char-set-char-op simple-cset-delete inversion-list-remove))
  363. (define char-set-delete!
  364. (make-char-set-char-op! simple-cset-delete! simple-cset-delete
  365. inversion-list-remove))
  366. ;;; Cursors
  367. ; A cursor is either an integer index into the mark vector (-1 for the
  368. ; end-of-char-set cursor) as in the reference implementation, and an
  369. ; inversion-list cursor otherwise.
  370. (define (char-set-cursor cset)
  371. (let ((simple-cursor
  372. (simple-cset-cursor-next (char-set-simple cset)
  373. *simple-cset-boundary*)))
  374. (if (end-of-simple-cset? simple-cursor)
  375. (inversion-list-cursor (char-set-i-list cset))
  376. simple-cursor)))
  377. (define (end-of-char-set? cursor)
  378. (and (inversion-list-cursor? cursor)
  379. (inversion-list-cursor-at-end? cursor)))
  380. (define (char-set-ref cset cursor)
  381. (if (number? cursor)
  382. (simple-cset-cursor-ref cursor)
  383. (scalar-value->char (inversion-list-cursor-ref cursor))))
  384. (define (char-set-cursor-next cset cursor)
  385. (cond
  386. ((number? cursor)
  387. (let ((next (simple-cset-cursor-next (char-set-simple cset) cursor)))
  388. (if (end-of-simple-cset? next)
  389. (inversion-list-cursor (char-set-i-list cset))
  390. next)))
  391. (else
  392. (inversion-list-cursor-next (char-set-i-list cset) cursor))))
  393. (define (char-set-for-each proc cs)
  394. (simple-cset-for-each proc (char-set-simple cs))
  395. (inversion-list-fold/done? (lambda (n _)
  396. (proc (scalar-value->char n))
  397. (unspecific))
  398. #f
  399. (lambda (_) #f)
  400. (char-set-i-list cs)))
  401. ; this is pretty inefficent
  402. (define (char-set-map proc cs)
  403. (let ((simple-cset (make-empty-simple-cset))
  404. (other-scalar-values '()))
  405. (define (adjoin! c)
  406. (let ((c (proc c)))
  407. (if (simple-char? c)
  408. (simple-cset-adjoin! simple-cset c)
  409. (set! other-scalar-values
  410. (cons (char->scalar-value c) other-scalar-values)))))
  411. (char-set-for-each adjoin! cs)
  412. (make-char-set simple-cset
  413. (apply numbers->inversion-list
  414. *simple-cset-boundary* (+ 1 #x10ffff)
  415. other-scalar-values))))
  416. (define (char-set-fold kons knil cs)
  417. (inversion-list-fold/done? (lambda (n v)
  418. (kons (scalar-value->char n) v))
  419. (simple-cset-fold kons knil (char-set-simple cs))
  420. (lambda (_) #f)
  421. (char-set-i-list cs)))
  422. (define (char-set-every pred cs)
  423. (and (simple-cset-every? pred (char-set-simple cs))
  424. (inversion-list-fold/done? (lambda (n v)
  425. (and v
  426. (pred (scalar-value->char n))))
  427. #t
  428. not
  429. (char-set-i-list cs))))
  430. (define (char-set-any pred cs)
  431. (or (simple-cset-any pred (char-set-simple cs))
  432. (inversion-list-fold/done? (lambda (n v)
  433. (or v
  434. (pred (scalar-value->char n))))
  435. #f
  436. values
  437. (char-set-i-list cs))))
  438. (define (base-char-set maybe-base-cs)
  439. (if maybe-base-cs
  440. (char-set-copy maybe-base-cs)
  441. (make-empty-char-set)))
  442. (define char-set-unfold
  443. (opt-lambda (p f g seed (maybe-base-cs #f))
  444. (char-set-unfold! p f g seed
  445. (base-char-set maybe-base-cs))))
  446. (define (char-set-unfold! p f g seed base-cs)
  447. (let loop ((seed seed) (cs base-cs))
  448. (if (p seed) cs ; P says we are done.
  449. (loop (g seed) ; Loop on (G SEED).
  450. (char-set-adjoin! cs (f seed)))))) ; Add (F SEED) to set.
  451. ; converting from and to lists
  452. (define (char-set . chars)
  453. (list->char-set chars))
  454. (define list->char-set
  455. (opt-lambda (chars (maybe-base-cs #f))
  456. (list->char-set! chars
  457. (base-char-set maybe-base-cs))))
  458. (define (list->char-set! chars cs)
  459. (for-each (lambda (c)
  460. (char-set-adjoin! cs c))
  461. chars)
  462. cs)
  463. (define (char-set->list cs)
  464. (char-set-fold cons '() cs))
  465. ; converting to and from strings
  466. (define string->char-set
  467. (opt-lambda (str (maybe-base-cs #f))
  468. (string->char-set! str
  469. (base-char-set maybe-base-cs))))
  470. (define (string->char-set! str cs)
  471. (do ((i (- (string-length str) 1) (- i 1)))
  472. ((< i 0))
  473. (char-set-adjoin! cs (string-ref str i)))
  474. cs)
  475. (define (char-set->string cs)
  476. (let ((ans (make-string (char-set-size cs))))
  477. (char-set-fold (lambda (ch i)
  478. (string-set! ans i ch)
  479. (+ i 1))
  480. 0
  481. cs)
  482. ans))
  483. (define ucs-range->char-set
  484. (opt-lambda (lower upper (error? #f) (maybe-base-cs #f))
  485. (ucs-range->char-set! lower upper error?
  486. (base-char-set maybe-base-cs))))
  487. (define (ucs-range->char-set! lower upper error? base-cs)
  488. (if (negative? lower)
  489. (call-error "negative lower bound" ucs-range->char-set! lower))
  490. (if (> lower #x10ffff)
  491. (call-error "invalid lower bound" ucs-range->char-set! lower))
  492. (if (negative? upper)
  493. (call-error "negative upper bound" ucs-range->char-set! upper))
  494. (if (> upper #x110000)
  495. (call-error "invalid lower bound" ucs-range->char-set! upper))
  496. (if (not (<= lower upper))
  497. (call-error "decreasing bounds" ucs-range->char-set! lower upper))
  498. (let ((create-inversion-list
  499. (lambda (lower upper)
  500. (cond
  501. ((and (>= lower #xD800)
  502. (>= #xe000 upper))
  503. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff)))
  504. ((<= upper #xe000)
  505. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  506. lower (min #xd800 upper)))
  507. ((>= lower #xd800)
  508. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  509. (max #xe000 lower) upper))
  510. (else
  511. ;; hole
  512. (ranges->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  513. (cons lower #xd800)
  514. (cons #xe000 upper)))))))
  515. (char-set-union!
  516. base-cs
  517. (cond
  518. ((>= lower *simple-cset-boundary*)
  519. (make-char-set (make-empty-simple-cset)
  520. (create-inversion-list lower upper)))
  521. ((< upper *simple-cset-boundary*)
  522. (make-char-set (ucs-range->simple-cset lower upper)
  523. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
  524. (else
  525. (make-char-set (ucs-range->simple-cset lower *simple-cset-boundary*)
  526. (create-inversion-list *simple-cset-boundary* upper)))))))
  527. (define char-set-filter
  528. (opt-lambda (predicate domain (maybe-base-cs #f))
  529. (char-set-filter! predicate
  530. domain
  531. (base-char-set maybe-base-cs))))
  532. (define (char-set-filter! predicate domain base-cs)
  533. (char-set-fold (lambda (ch _)
  534. (if (predicate ch)
  535. (char-set-adjoin! base-cs ch)))
  536. (unspecific)
  537. domain)
  538. base-cs)
  539. ; {string, char, char-set, char predicate} -> char-set
  540. ; This is called ->CHAR-SET in the SRFI, but that's not a valid R5RS
  541. ; identifier.
  542. (define (x->char-set x)
  543. (cond ((char-set? x) x)
  544. ((string? x) (string->char-set x))
  545. ((char? x) (char-set x))
  546. (else (call-error "->char-set: Not a charset, string or char." x->char-set))))
  547. ; Set algebra
  548. (define *surrogate-complement-i-list*
  549. (inversion-list-complement
  550. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  551. #xd800 #xe000)))
  552. (define (char-set-complement cs)
  553. (make-char-set (simple-cset-complement (char-set-simple cs))
  554. (inversion-list-intersection
  555. (inversion-list-complement (char-set-i-list cs))
  556. *surrogate-complement-i-list*)))
  557. (define (char-set-complement! cs)
  558. (set-char-set-simple! cs
  559. (simple-cset-complement! (char-set-simple cs)))
  560. (set-char-set-i-list! cs
  561. (inversion-list-intersection
  562. (inversion-list-complement (char-set-i-list cs))
  563. *surrogate-complement-i-list*))
  564. cs)
  565. (define (make-char-set-op! simple-cset-op! inversion-list-op)
  566. (lambda (cset1 . csets)
  567. (set-char-set-simple! cset1
  568. (apply simple-cset-op!
  569. (char-set-simple cset1)
  570. (map char-set-simple csets)))
  571. (set-char-set-i-list! cset1
  572. (apply inversion-list-op
  573. (char-set-i-list cset1)
  574. (map char-set-i-list csets)))
  575. cset1))
  576. (define (make-char-set-op char-set-op! make-neutral)
  577. (lambda csets
  578. (if (pair? csets)
  579. (apply char-set-op! (char-set-copy (car csets)) (cdr csets))
  580. (make-neutral))))
  581. (define char-set-union!
  582. (make-char-set-op! simple-cset-union! inversion-list-union))
  583. (define char-set-union
  584. (make-char-set-op char-set-union! make-empty-char-set))
  585. (define char-set-intersection!
  586. (make-char-set-op! simple-cset-intersection! inversion-list-intersection))
  587. (define char-set-intersection
  588. (make-char-set-op char-set-intersection! make-full-char-set))
  589. (define char-set-difference!
  590. (make-char-set-op! simple-cset-difference! inversion-list-difference))
  591. (define (char-set-difference cset1 . csets)
  592. (apply char-set-difference! (char-set-copy cset1) csets))
  593. ; copied from inversion-list.scm
  594. (define (binary->n-ary proc/2)
  595. (lambda (arg-1 . args)
  596. (if (and (pair? args)
  597. (null? (cdr args)))
  598. (proc/2 arg-1 (car args))
  599. (let loop ((args args)
  600. (result arg-1))
  601. (if (null? args)
  602. result
  603. (loop (cdr args) (proc/2 result (car args))))))))
  604. (define inversion-list-xor
  605. (binary->n-ary
  606. (lambda (i-list-1 i-list-2)
  607. (inversion-list-union (inversion-list-intersection
  608. (inversion-list-complement i-list-1)
  609. i-list-2)
  610. (inversion-list-intersection
  611. i-list-1
  612. (inversion-list-complement i-list-2))))))
  613. ; Really inefficient for things outside Latin-1
  614. ; WHO NEEDS THIS NONSENSE, ANYWAY?
  615. (define char-set-xor!
  616. (make-char-set-op! simple-cset-xor! inversion-list-xor))
  617. (define char-set-xor
  618. (make-char-set-op char-set-xor! make-empty-char-set))
  619. (define (char-set-diff+intersection! cs1 cs2 . csets)
  620. (call-with-values
  621. (lambda () (apply simple-cset-diff+intersection!
  622. (char-set-simple cs1) (char-set-simple cs2)
  623. (map char-set-simple csets)))
  624. (lambda (simple-diff simple-intersection)
  625. (set-char-set-simple! cs1 simple-diff)
  626. (set-char-set-simple! cs2 simple-intersection)
  627. (set-char-set-i-list! cs1
  628. (apply inversion-list-difference
  629. (char-set-i-list cs1)
  630. (char-set-i-list cs2)
  631. (map char-set-i-list csets)))
  632. (set-char-set-i-list! cs2 (inversion-list-intersection
  633. (char-set-i-list cs1)
  634. (apply inversion-list-union
  635. (char-set-i-list cs2)
  636. (map char-set-i-list csets))))
  637. (values cs1 cs2))))
  638. (define (char-set-diff+intersection cs1 . csets)
  639. (apply char-set-diff+intersection!
  640. (char-set-copy cs1)
  641. (make-empty-char-set)
  642. csets))
  643. ;; Byte vector utilities
  644. (define (byte-vector-copy b)
  645. (let* ((size (byte-vector-length b))
  646. (result (make-byte-vector size 0)))
  647. (copy-bytes! b 0 result 0 size)
  648. result))
  649. ;;; Apply P to each index and its char code in S: (P I VAL).
  650. ;;; Used by the set-algebra ops.
  651. (define (byte-vector-iter p s)
  652. (let loop ((i (- (byte-vector-length s) 1)))
  653. (if (>= i 0)
  654. (begin
  655. (p i (byte-vector-ref s i))
  656. (loop (- i 1))))))
  657. (define (byte-vector=? b1 b2)
  658. (let ((size-1 (byte-vector-length b1))
  659. (size-2 (byte-vector-length b2)))
  660. (and (= size-1 size-2)
  661. (let loop ((i 0))
  662. (cond
  663. ((>= i size-1) #t)
  664. ((= (byte-vector-ref b1 i) (byte-vector-ref b2 i))
  665. (loop (+ 1 i)))
  666. (else
  667. #f))))))
  668. ;; Utility for srfi-14-base-char-sets.scm, which follows
  669. ; The range vector is an even-sized vector with [lower, upper)
  670. ; pairs.
  671. (define (range-vector->char-set range-vector)
  672. (let ((size (vector-length range-vector))
  673. (simple-cset (make-empty-simple-cset)))
  674. (let loop ((index 0) (ranges '()))
  675. (if (>= index size)
  676. (make-char-set simple-cset
  677. (apply ranges->inversion-list
  678. *simple-cset-boundary* (+ 1 #x10ffff)
  679. ranges))
  680. (let ((lower (vector-ref range-vector index))
  681. (upper (vector-ref range-vector (+ 1 index))))
  682. (define (fill-simple-cset! lower upper)
  683. (let loop ((scalar-value lower))
  684. (if (< scalar-value upper)
  685. (begin
  686. (simple-cset-adjoin-code! simple-cset scalar-value)
  687. (loop (+ 1 scalar-value))))))
  688. (cond
  689. ((>= lower *simple-cset-boundary*)
  690. (loop (+ 2 index) (cons (cons lower upper) ranges)))
  691. ((< upper *simple-cset-boundary*)
  692. (fill-simple-cset! lower upper)
  693. (loop (+ 2 index) ranges))
  694. (else
  695. (fill-simple-cset! lower *simple-cset-boundary*)
  696. (loop (+ 2 index)
  697. (cons (cons *simple-cset-boundary* upper) ranges)))))))))