srfi-14.scm 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Mike Sperber, Robert Tansom
  3. ; Copyright (c) 2005-2006 by Basis Technology Corporation.
  4. ; This is basically a complete re-implementation, suitable for Unicode.
  5. ; Some bits and pieces from Olin's reference implementation remain,
  6. ; but none from the MIT Scheme code. For whatever remains, the
  7. ; following copyright holds:
  8. ; Copyright (c) 1994-2003 by Olin Shivers
  9. ;
  10. ; All rights reserved.
  11. ;
  12. ; Redistribution and use in source and binary forms, with or without
  13. ; modification, are permitted provided that the following conditions
  14. ; are met:
  15. ; 1. Redistributions of source code must retain the above copyright
  16. ; notice, this list of conditions and the following disclaimer.
  17. ; 2. Redistributions in binary form must reproduce the above copyright
  18. ; notice, this list of conditions and the following disclaimer in the
  19. ; documentation and/or other materials provided with the distribution.
  20. ; 3. The name of the authors may not be used to endorse or promote products
  21. ; derived from this software without specific prior written permission.
  22. ;
  23. ; THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  24. ; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  25. ; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  26. ; IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27. ; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  28. ; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29. ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30. ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31. ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  32. ; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33. (define-record-type :char-set
  34. (make-char-set simple i-list)
  35. char-set?
  36. ;; byte vector for the Latin-1 part
  37. (simple char-set-simple
  38. set-char-set-simple!)
  39. ;; inversion list for the rest
  40. (i-list char-set-i-list
  41. set-char-set-i-list!))
  42. (define-record-discloser :char-set
  43. (lambda (cs)
  44. (list 'char-set
  45. (char-set-size cs))))
  46. (define (make-char-set-immutable! char-set)
  47. (make-immutable! char-set)
  48. (make-immutable! (char-set-simple char-set)))
  49. ; inversion lists are always immutable
  50. ;;; "Simple Csets"---we use mutable byte vectors for the Latin-1 part
  51. (define *simple-cset-boundary* 256)
  52. (define (simple-char? c)
  53. (< (char->scalar-value c) *simple-cset-boundary*))
  54. (define (make-empty-simple-cset)
  55. (make-byte-vector *simple-cset-boundary* 0))
  56. (define (make-full-simple-cset)
  57. (make-byte-vector *simple-cset-boundary* 1))
  58. (define (copy-simple-cset s)
  59. (byte-vector-copy s))
  60. ; don't mistake these for abstractions
  61. (define (simple-cset-code-not-member? s i) (zero? (byte-vector-ref s i)))
  62. (define (simple-cset-code-member? s i) (not (simple-cset-code-not-member? s i)))
  63. (define (simple-cset-ref s i) (byte-vector-ref s i))
  64. (define (simple-cset-set! s i v) (byte-vector-set! s i v))
  65. (define (simple-cset-remove-code! s i) (byte-vector-set! s i 0))
  66. (define (simple-cset-adjoin-code! s i) (byte-vector-set! s i 1))
  67. (define (simple-cset-contains? s char)
  68. (simple-cset-code-member? s (char->scalar-value char)))
  69. (define (simple-cset=? s1 s2)
  70. (byte-vector=? s1 s2))
  71. (define (simple-cset<=? s1 s2)
  72. (or (eq? s1 s2)
  73. (let loop ((i 0))
  74. (if (>= i *simple-cset-boundary*)
  75. #t
  76. (and (<= (simple-cset-ref s1 i) (simple-cset-ref s2 i))
  77. (loop (+ 1 i)))))))
  78. (define (simple-cset-size s)
  79. (let loop ((i 0) (size 0))
  80. (if (>= i *simple-cset-boundary*)
  81. size
  82. (loop (+ 1 i) (+ size (simple-cset-ref s i))))))
  83. (define (simple-cset-count pred s)
  84. (let loop ((i 0) (count 0))
  85. (if (>= i *simple-cset-boundary*)
  86. count
  87. (loop (+ 1 i)
  88. (if (and (simple-cset-code-member? s i) (pred (scalar-value->char i)))
  89. (+ count 1)
  90. count)))))
  91. (define (simple-cset-modify! set s chars)
  92. (for-each (lambda (c) (set s (char->scalar-value c)))
  93. chars)
  94. s)
  95. (define (simple-cset-modify set s chars)
  96. (simple-cset-modify! set (copy-simple-cset s) chars))
  97. (define (simple-cset-adjoin s . chars)
  98. (simple-cset-modify simple-cset-adjoin-code! s chars))
  99. (define (simple-cset-adjoin! s . chars)
  100. (simple-cset-modify! simple-cset-adjoin-code! s chars))
  101. (define (simple-cset-delete s . chars)
  102. (simple-cset-modify simple-cset-remove-code! s chars))
  103. (define (simple-cset-delete! s . chars)
  104. (simple-cset-modify! simple-cset-remove-code! s chars))
  105. ;;; If we represented char sets as a bit set, we could do the following
  106. ;;; trick to pick the lowest bit out of the set:
  107. ;;; (count-bits (xor (- cset 1) cset))
  108. ;;; (But first mask out the bits already scanned by the cursor first.)
  109. (define (simple-cset-cursor-next s cursor)
  110. (let loop ((cur cursor))
  111. (let ((cur (- cur 1)))
  112. (if (or (< cur 0) (simple-cset-code-member? s cur))
  113. cur
  114. (loop cur)))))
  115. (define (end-of-simple-cset? cursor)
  116. (negative? cursor))
  117. (define (simple-cset-cursor-ref cursor)
  118. (scalar-value->char cursor))
  119. (define (simple-cset-for-each proc s)
  120. (let loop ((i 0))
  121. (if (< i *simple-cset-boundary*)
  122. (begin
  123. (if (simple-cset-code-member? s i)
  124. (proc (scalar-value->char i)))
  125. (loop (+ 1 i))))))
  126. (define (simple-cset-fold kons knil s)
  127. (let loop ((i 0) (ans knil))
  128. (if (>= i *simple-cset-boundary*)
  129. ans
  130. (loop (+ 1 i)
  131. (if (simple-cset-code-not-member? s i)
  132. ans
  133. (kons (scalar-value->char i) ans))))))
  134. (define (simple-cset-every? pred s)
  135. (let loop ((i 0))
  136. (cond
  137. ((>= i *simple-cset-boundary*)
  138. #t)
  139. ((or (simple-cset-code-not-member? s i)
  140. (pred (scalar-value->char i)))
  141. (loop (+ 1 i)))
  142. (else
  143. #f))))
  144. (define (simple-cset-any pred s)
  145. (let loop ((i 0))
  146. (cond
  147. ((>= i *simple-cset-boundary*) #f)
  148. ((and (simple-cset-code-member? s i)
  149. (pred (scalar-value->char i))))
  150. (else
  151. (loop (+ 1 i))))))
  152. (define (ucs-range->simple-cset lower upper)
  153. (let ((s (make-empty-simple-cset)))
  154. (let loop ((i lower))
  155. (if (< i upper)
  156. (begin
  157. (simple-cset-adjoin-code! s i)
  158. (loop (+ 1 i)))))
  159. s))
  160. ; Algebra
  161. ; These do various "s[i] := s[i] op val" operations
  162. (define (simple-cset-invert-code! s i v)
  163. (simple-cset-set! s i (- 1 v)))
  164. (define (simple-cset-and-code! s i v)
  165. (if (zero? v)
  166. (simple-cset-remove-code! s i)))
  167. (define (simple-cset-or-code! s i v)
  168. (if (not (zero? v))
  169. (simple-cset-adjoin-code! s i)))
  170. (define (simple-cset-minus-code! s i v)
  171. (if (not (zero? v))
  172. (simple-cset-remove-code! s i)))
  173. (define (simple-cset-xor-code! s i v)
  174. (if (not (zero? v))
  175. (simple-cset-set! s i (- 1 (simple-cset-ref s i)))))
  176. (define (simple-cset-complement s)
  177. (simple-cset-complement! (copy-simple-cset s)))
  178. (define (simple-cset-complement! s)
  179. (byte-vector-iter (lambda (i v) (simple-cset-invert-code! s i v)) s)
  180. s)
  181. (define (simple-cset-op! s simple-csets code-op!)
  182. (for-each (lambda (s2)
  183. (let loop ((i 0))
  184. (if (< i *simple-cset-boundary*)
  185. (begin
  186. (code-op! s i (simple-cset-ref s2 i))
  187. (loop (+ 1 i))))))
  188. simple-csets)
  189. s)
  190. (define (simple-cset-union! s1 . ss)
  191. (simple-cset-op! s1 ss simple-cset-or-code!))
  192. (define (simple-cset-union . ss)
  193. (if (pair? ss)
  194. (apply simple-cset-union!
  195. (byte-vector-copy (car ss))
  196. (cdr ss))
  197. (make-empty-simple-cset)))
  198. (define (simple-cset-intersection! s1 . ss)
  199. (simple-cset-op! s1 ss simple-cset-and-code!))
  200. (define (simple-cset-intersection . ss)
  201. (if (pair? ss)
  202. (apply simple-cset-intersection!
  203. (byte-vector-copy (car ss))
  204. (cdr ss))
  205. (make-full-simple-cset)))
  206. (define (simple-cset-difference! s1 . ss)
  207. (simple-cset-op! s1 ss simple-cset-minus-code!))
  208. (define (simple-cset-difference s1 . ss)
  209. (if (pair? ss)
  210. (apply simple-cset-difference! (copy-simple-cset s1) ss)
  211. (copy-simple-cset s1)))
  212. (define (simple-cset-xor! s1 . ss)
  213. (simple-cset-op! s1 ss simple-cset-xor-code!))
  214. (define (simple-cset-xor . ss)
  215. (if (pair? ss)
  216. (apply simple-cset-xor!
  217. (byte-vector-copy (car ss))
  218. (cdr ss))
  219. (make-empty-simple-cset)))
  220. (define (simple-cset-diff+intersection! s1 s2 . ss)
  221. (byte-vector-iter (lambda (i v)
  222. (cond
  223. ((zero? v)
  224. (simple-cset-remove-code! s2 i))
  225. ((simple-cset-code-member? s2 i)
  226. (simple-cset-remove-code! s1 i))))
  227. s1)
  228. (for-each (lambda (s)
  229. (byte-vector-iter (lambda (i v)
  230. (if (and (not (zero? v))
  231. (simple-cset-code-member? s1 i))
  232. (begin
  233. (simple-cset-remove-code! s1 i)
  234. (simple-cset-adjoin-code! s2 i))))
  235. s))
  236. ss)
  237. (values s1 s2))
  238. ; Compute (c + 37 c + 37^2 c + ...) modulo BOUND, with sleaze thrown
  239. ; in to keep the intermediate values small. (We do the calculation
  240. ; with just enough bits to represent BOUND, masking off high bits at
  241. ; each step in calculation. If this screws up any important properties
  242. ; of the hash function I'd like to hear about it. -Olin)
  243. (define (simple-cset-hash s bound)
  244. ;; The mask that will cover BOUND-1:
  245. (let ((mask (let loop ((i #x10000)) ; Let's skip first 16 iterations, eh?
  246. (if (>= i bound) (- i 1) (loop (+ i i))))))
  247. (let loop ((i (- *simple-cset-boundary* 1)) (ans 0))
  248. (if (< i 0)
  249. (modulo ans bound)
  250. (loop (- i 1)
  251. (if (simple-cset-code-not-member? s i)
  252. ans
  253. (bitwise-and mask (+ (* 37 ans) i))))))))
  254. ;;; Now for the real character sets
  255. (define (make-empty-char-set)
  256. (make-char-set (make-empty-simple-cset)
  257. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
  258. (define (make-full-char-set)
  259. (make-char-set (make-full-simple-cset)
  260. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  261. *simple-cset-boundary* (+ 1 #x10ffff))))
  262. (define (char-set-copy cs)
  263. (make-char-set (copy-simple-cset (char-set-simple cs))
  264. (inversion-list-copy (char-set-i-list cs))))
  265. ; n-ary version
  266. (define (char-set= . rest)
  267. (or (null? rest)
  268. (let ((cs1 (car rest))
  269. (rest (cdr rest)))
  270. (let loop ((rest rest))
  271. (or (not (pair? rest))
  272. (and (char-set=/2 cs1 (car rest))
  273. (loop (cdr rest))))))))
  274. ; binary version
  275. (define (char-set=/2 cs-1 cs-2)
  276. (and (simple-cset=? (char-set-simple cs-1) (char-set-simple cs-2))
  277. (inversion-list=? (char-set-i-list cs-1)
  278. (char-set-i-list cs-2))))
  279. ; n-ary
  280. (define (char-set<= . rest)
  281. (or (null? rest)
  282. (let ((cs1 (car rest))
  283. (rest (cdr rest)))
  284. (let loop ((cs1 cs1) (rest rest))
  285. (or (not (pair? rest))
  286. (and (char-set<=/2 cs1 (car rest))
  287. (loop (car rest) (cdr rest))))))))
  288. ; binary
  289. (define (char-set<=/2 cs-1 cs-2)
  290. (and (simple-cset<=? (char-set-simple cs-1) (char-set-simple cs-2))
  291. (inversion-list<=? (char-set-i-list cs-1)
  292. (char-set-i-list cs-2))))
  293. (define (inversion-list<=? i-list-1 i-list-2)
  294. (inversion-list=? i-list-1
  295. (inversion-list-intersection i-list-1 i-list-2)))
  296. ;;; Hash
  297. ; We follow Olin's reference implementation:
  298. ;
  299. ; If you keep BOUND small enough, the intermediate calculations will
  300. ; always be fixnums. How small is dependent on the underlying Scheme system;
  301. ; we use a default BOUND of 2^22 = 4194304, which should hack it in
  302. ; Schemes that give you at least 29 signed bits for fixnums. The core
  303. ; calculation that you don't want to overflow is, worst case,
  304. ; (+ 65535 (* 37 (- bound 1)))
  305. ; where 65535 is the max character code. Choose the default BOUND to be the
  306. ; biggest power of two that won't cause this expression to fixnum overflow,
  307. ; and everything will be copacetic.
  308. (define char-set-hash
  309. (opt-lambda (cs (bound 4194304))
  310. (if (not (and (integer? bound)
  311. (exact? bound)
  312. (<= 0 bound)))
  313. (assertion-violation 'char-set-hash "invalid bound" bound))
  314. (let ((bound (if (zero? bound) 4194304 bound)))
  315. (modulo (+ (simple-cset-hash (char-set-simple cs) bound)
  316. (* 37 (inversion-list-hash (char-set-i-list cs) bound)))
  317. bound))))
  318. (define (char-set-contains? cs char)
  319. (if (simple-char? char)
  320. (simple-cset-contains? (char-set-simple cs) char)
  321. (inversion-list-member? (char->scalar-value char)
  322. (char-set-i-list cs))))
  323. (define (char-set-size cs)
  324. (+ (simple-cset-size (char-set-simple cs))
  325. (inversion-list-size (char-set-i-list cs))))
  326. (define (char-set-count pred cset)
  327. (+ (simple-cset-count pred (char-set-simple cset))
  328. (inversion-list-count pred (char-set-i-list cset))))
  329. (define (inversion-list-count pred i-list)
  330. (inversion-list-fold/done? (lambda (v count)
  331. (if (pred (scalar-value->char v))
  332. (+ 1 count)
  333. count))
  334. 0
  335. (lambda (v) #f)
  336. i-list))
  337. (define (make-char-set-char-op simple-cset-op inversion-list-op)
  338. (lambda (cs . chars)
  339. (call-with-values
  340. (lambda () (partition-list simple-char? chars))
  341. (lambda (simple-chars non-simple-chars)
  342. (make-char-set (apply simple-cset-op (char-set-simple cs) simple-chars)
  343. (apply inversion-list-op (char-set-i-list cs)
  344. (map char->scalar-value non-simple-chars)))))))
  345. (define (make-char-set-char-op! simple-cset-op! simple-cset-op
  346. inversion-list-op)
  347. (lambda (cs . chars)
  348. (call-with-values
  349. (lambda () (partition-list simple-char? chars))
  350. (lambda (simple-chars non-simple-chars)
  351. (if (null? non-simple-chars)
  352. (apply simple-cset-op! (char-set-simple cs) simple-chars)
  353. (begin
  354. (set-char-set-simple! cs
  355. (apply simple-cset-op (char-set-simple cs)
  356. simple-chars))
  357. (set-char-set-i-list! cs
  358. (apply inversion-list-op (char-set-i-list cs)
  359. (map char->scalar-value non-simple-chars)))))))
  360. cs))
  361. (define char-set-adjoin
  362. (make-char-set-char-op simple-cset-adjoin inversion-list-adjoin))
  363. (define char-set-adjoin!
  364. (make-char-set-char-op! simple-cset-adjoin! simple-cset-adjoin
  365. inversion-list-adjoin))
  366. (define char-set-delete
  367. (make-char-set-char-op simple-cset-delete inversion-list-remove))
  368. (define char-set-delete!
  369. (make-char-set-char-op! simple-cset-delete! simple-cset-delete
  370. inversion-list-remove))
  371. ;;; Cursors
  372. ; A cursor is either an integer index into the mark vector (-1 for the
  373. ; end-of-char-set cursor) as in the reference implementation, and an
  374. ; inversion-list cursor otherwise.
  375. (define (char-set-cursor cset)
  376. (let ((simple-cursor
  377. (simple-cset-cursor-next (char-set-simple cset)
  378. *simple-cset-boundary*)))
  379. (if (end-of-simple-cset? simple-cursor)
  380. (inversion-list-cursor (char-set-i-list cset))
  381. simple-cursor)))
  382. (define (end-of-char-set? cursor)
  383. (and (inversion-list-cursor? cursor)
  384. (inversion-list-cursor-at-end? cursor)))
  385. (define (char-set-ref cset cursor)
  386. (if (number? cursor)
  387. (simple-cset-cursor-ref cursor)
  388. (scalar-value->char (inversion-list-cursor-ref cursor))))
  389. (define (char-set-cursor-next cset cursor)
  390. (cond
  391. ((number? cursor)
  392. (let ((next (simple-cset-cursor-next (char-set-simple cset) cursor)))
  393. (if (end-of-simple-cset? next)
  394. (inversion-list-cursor (char-set-i-list cset))
  395. next)))
  396. (else
  397. (inversion-list-cursor-next (char-set-i-list cset) cursor))))
  398. (define (char-set-for-each proc cs)
  399. (simple-cset-for-each proc (char-set-simple cs))
  400. (inversion-list-fold/done? (lambda (n _)
  401. (proc (scalar-value->char n))
  402. (unspecific))
  403. #f
  404. (lambda (_) #f)
  405. (char-set-i-list cs)))
  406. ; this is pretty inefficent
  407. (define (char-set-map proc cs)
  408. (let ((simple-cset (make-empty-simple-cset))
  409. (other-scalar-values '()))
  410. (define (adjoin! c)
  411. (let ((c (proc c)))
  412. (if (simple-char? c)
  413. (simple-cset-adjoin! simple-cset c)
  414. (set! other-scalar-values
  415. (cons (char->scalar-value c) other-scalar-values)))))
  416. (char-set-for-each adjoin! cs)
  417. (make-char-set simple-cset
  418. (apply numbers->inversion-list
  419. *simple-cset-boundary* (+ 1 #x10ffff)
  420. other-scalar-values))))
  421. (define (char-set-fold kons knil cs)
  422. (inversion-list-fold/done? (lambda (n v)
  423. (kons (scalar-value->char n) v))
  424. (simple-cset-fold kons knil (char-set-simple cs))
  425. (lambda (_) #f)
  426. (char-set-i-list cs)))
  427. (define (char-set-every pred cs)
  428. (and (simple-cset-every? pred (char-set-simple cs))
  429. (inversion-list-fold/done? (lambda (n v)
  430. (and v
  431. (pred (scalar-value->char n))))
  432. #t
  433. not
  434. (char-set-i-list cs))))
  435. (define (char-set-any pred cs)
  436. (or (simple-cset-any pred (char-set-simple cs))
  437. (inversion-list-fold/done? (lambda (n v)
  438. (or v
  439. (pred (scalar-value->char n))))
  440. #f
  441. values
  442. (char-set-i-list cs))))
  443. (define (base-char-set maybe-base-cs)
  444. (if maybe-base-cs
  445. (char-set-copy maybe-base-cs)
  446. (make-empty-char-set)))
  447. (define char-set-unfold
  448. (opt-lambda (p f g seed (maybe-base-cs #f))
  449. (char-set-unfold! p f g seed
  450. (base-char-set maybe-base-cs))))
  451. (define (char-set-unfold! p f g seed base-cs)
  452. (let loop ((seed seed) (cs base-cs))
  453. (if (p seed) cs ; P says we are done.
  454. (loop (g seed) ; Loop on (G SEED).
  455. (char-set-adjoin! cs (f seed)))))) ; Add (F SEED) to set.
  456. ; converting from and to lists
  457. (define (char-set . chars)
  458. (list->char-set chars))
  459. (define list->char-set
  460. (opt-lambda (chars (maybe-base-cs #f))
  461. (list->char-set! chars
  462. (base-char-set maybe-base-cs))))
  463. (define (list->char-set! chars cs)
  464. (for-each (lambda (c)
  465. (char-set-adjoin! cs c))
  466. chars)
  467. cs)
  468. (define (char-set->list cs)
  469. (char-set-fold cons '() cs))
  470. ; converting to and from strings
  471. (define string->char-set
  472. (opt-lambda (str (maybe-base-cs #f))
  473. (string->char-set! str
  474. (base-char-set maybe-base-cs))))
  475. (define (string->char-set! str cs)
  476. (do ((i (- (string-length str) 1) (- i 1)))
  477. ((< i 0))
  478. (char-set-adjoin! cs (string-ref str i)))
  479. cs)
  480. (define (char-set->string cs)
  481. (let ((ans (make-string (char-set-size cs))))
  482. (char-set-fold (lambda (ch i)
  483. (string-set! ans i ch)
  484. (+ i 1))
  485. 0
  486. cs)
  487. ans))
  488. (define ucs-range->char-set
  489. (opt-lambda (lower upper (error? #f) (maybe-base-cs #f))
  490. (ucs-range->char-set! lower upper error?
  491. (base-char-set maybe-base-cs))))
  492. (define (ucs-range->char-set! lower upper error? base-cs)
  493. (if (negative? lower)
  494. (assertion-violation 'ucs-range->char-set! "negative lower bound" lower))
  495. (if (> lower #x10ffff)
  496. (assertion-violation 'ucs-range->char-set! "invalid lower bound" lower))
  497. (if (negative? upper)
  498. (assertion-violation 'ucs-range->char-set! "negative upper bound" upper))
  499. (if (> upper #x110000)
  500. (assertion-violation 'ucs-range->char-set! "invalid lower bound" upper))
  501. (if (not (<= lower upper))
  502. (assertion-violation 'ucs-range->char-set! "decreasing bounds" lower upper))
  503. (let ((create-inversion-list
  504. (lambda (lower upper)
  505. (cond
  506. ((and (>= lower #xD800)
  507. (>= #xe000 upper))
  508. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff)))
  509. ((<= upper #xe000)
  510. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  511. lower (min #xd800 upper)))
  512. ((>= lower #xd800)
  513. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  514. (max #xe000 lower) upper))
  515. (else
  516. ;; hole
  517. (ranges->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  518. (cons lower #xd800)
  519. (cons #xe000 upper)))))))
  520. (char-set-union!
  521. base-cs
  522. (cond
  523. ((>= lower *simple-cset-boundary*)
  524. (make-char-set (make-empty-simple-cset)
  525. (create-inversion-list lower upper)))
  526. ((< upper *simple-cset-boundary*)
  527. (make-char-set (ucs-range->simple-cset lower upper)
  528. (make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
  529. (else
  530. (make-char-set (ucs-range->simple-cset lower *simple-cset-boundary*)
  531. (create-inversion-list *simple-cset-boundary* upper)))))))
  532. (define char-set-filter
  533. (opt-lambda (predicate domain (maybe-base-cs #f))
  534. (char-set-filter! predicate
  535. domain
  536. (base-char-set maybe-base-cs))))
  537. (define (char-set-filter! predicate domain base-cs)
  538. (char-set-fold (lambda (ch _)
  539. (if (predicate ch)
  540. (char-set-adjoin! base-cs ch)))
  541. (unspecific)
  542. domain)
  543. base-cs)
  544. ; {string, char, char-set, char predicate} -> char-set
  545. ; This is called ->CHAR-SET in the SRFI, but that's not a valid R5RS
  546. ; identifier.
  547. (define (x->char-set x)
  548. (cond ((char-set? x) x)
  549. ((string? x) (string->char-set x))
  550. ((char? x) (char-set x))
  551. (else (assertion-violation 'x->char-set "Not a charset, string or char."))))
  552. ; Set algebra
  553. (define *surrogate-complement-i-list*
  554. (inversion-list-complement
  555. (range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
  556. #xd800 #xe000)))
  557. (define (char-set-complement cs)
  558. (make-char-set (simple-cset-complement (char-set-simple cs))
  559. (inversion-list-intersection
  560. (inversion-list-complement (char-set-i-list cs))
  561. *surrogate-complement-i-list*)))
  562. (define (char-set-complement! cs)
  563. (set-char-set-simple! cs
  564. (simple-cset-complement! (char-set-simple cs)))
  565. (set-char-set-i-list! cs
  566. (inversion-list-intersection
  567. (inversion-list-complement (char-set-i-list cs))
  568. *surrogate-complement-i-list*))
  569. cs)
  570. (define (make-char-set-op! simple-cset-op! inversion-list-op)
  571. (lambda (cset1 . csets)
  572. (set-char-set-simple! cset1
  573. (apply simple-cset-op!
  574. (char-set-simple cset1)
  575. (map char-set-simple csets)))
  576. (set-char-set-i-list! cset1
  577. (apply inversion-list-op
  578. (char-set-i-list cset1)
  579. (map char-set-i-list csets)))
  580. cset1))
  581. (define (make-char-set-op char-set-op! make-neutral)
  582. (lambda csets
  583. (if (pair? csets)
  584. (apply char-set-op! (char-set-copy (car csets)) (cdr csets))
  585. (make-neutral))))
  586. (define char-set-union!
  587. (make-char-set-op! simple-cset-union! inversion-list-union))
  588. (define char-set-union
  589. (make-char-set-op char-set-union! make-empty-char-set))
  590. (define char-set-intersection!
  591. (make-char-set-op! simple-cset-intersection! inversion-list-intersection))
  592. (define char-set-intersection
  593. (make-char-set-op char-set-intersection! make-full-char-set))
  594. (define char-set-difference!
  595. (make-char-set-op! simple-cset-difference! inversion-list-difference))
  596. (define (char-set-difference cset1 . csets)
  597. (apply char-set-difference! (char-set-copy cset1) csets))
  598. ; copied from inversion-list.scm
  599. (define (binary->n-ary proc/2)
  600. (lambda (arg-1 . args)
  601. (if (and (pair? args)
  602. (null? (cdr args)))
  603. (proc/2 arg-1 (car args))
  604. (let loop ((args args)
  605. (result arg-1))
  606. (if (null? args)
  607. result
  608. (loop (cdr args) (proc/2 result (car args))))))))
  609. (define inversion-list-xor
  610. (binary->n-ary
  611. (lambda (i-list-1 i-list-2)
  612. (inversion-list-union (inversion-list-intersection
  613. (inversion-list-complement i-list-1)
  614. i-list-2)
  615. (inversion-list-intersection
  616. i-list-1
  617. (inversion-list-complement i-list-2))))))
  618. ; Really inefficient for things outside Latin-1
  619. ; WHO NEEDS THIS NONSENSE, ANYWAY?
  620. (define char-set-xor!
  621. (make-char-set-op! simple-cset-xor! inversion-list-xor))
  622. (define char-set-xor
  623. (make-char-set-op char-set-xor! make-empty-char-set))
  624. (define (char-set-diff+intersection! cs1 cs2 . csets)
  625. (call-with-values
  626. (lambda () (apply simple-cset-diff+intersection!
  627. (char-set-simple cs1) (char-set-simple cs2)
  628. (map char-set-simple csets)))
  629. (lambda (simple-diff simple-intersection)
  630. (set-char-set-simple! cs1 simple-diff)
  631. (set-char-set-simple! cs2 simple-intersection)
  632. (let ((i-list-1 (char-set-i-list cs1))
  633. (i-list-2 (char-set-i-list cs2))
  634. (i-list-rest (map char-set-i-list csets)))
  635. (set-char-set-i-list! cs1
  636. (apply inversion-list-difference
  637. i-list-1 i-list-2
  638. i-list-rest))
  639. (set-char-set-i-list! cs2
  640. (inversion-list-intersection
  641. i-list-1
  642. (apply inversion-list-union
  643. i-list-2
  644. i-list-rest)))
  645. (values cs1 cs2)))))
  646. (define (char-set-diff+intersection cs1 . csets)
  647. (apply char-set-diff+intersection!
  648. (char-set-copy cs1)
  649. (make-empty-char-set)
  650. csets))
  651. ;; Byte vector utilities
  652. (define (byte-vector-copy b)
  653. (let* ((size (byte-vector-length b))
  654. (result (make-byte-vector size 0)))
  655. (copy-bytes! b 0 result 0 size)
  656. result))
  657. ;;; Apply P to each index and its char code in S: (P I VAL).
  658. ;;; Used by the set-algebra ops.
  659. (define (byte-vector-iter p s)
  660. (let loop ((i (- (byte-vector-length s) 1)))
  661. (if (>= i 0)
  662. (begin
  663. (p i (byte-vector-ref s i))
  664. (loop (- i 1))))))
  665. ;; Utility for srfi-14-base-char-sets.scm, which follows
  666. ; The range vector is an even-sized vector with [lower, upper)
  667. ; pairs.
  668. (define (range-vector->char-set range-vector)
  669. (let ((size (vector-length range-vector))
  670. (simple-cset (make-empty-simple-cset)))
  671. (let loop ((index 0) (ranges '()))
  672. (if (>= index size)
  673. (make-char-set simple-cset
  674. (apply ranges->inversion-list
  675. *simple-cset-boundary* (+ 1 #x10ffff)
  676. ranges))
  677. (let ((lower (vector-ref range-vector index))
  678. (upper (vector-ref range-vector (+ 1 index))))
  679. (define (fill-simple-cset! lower upper)
  680. (let loop ((scalar-value lower))
  681. (if (< scalar-value upper)
  682. (begin
  683. (simple-cset-adjoin-code! simple-cset scalar-value)
  684. (loop (+ 1 scalar-value))))))
  685. (cond
  686. ((>= lower *simple-cset-boundary*)
  687. (loop (+ 2 index) (cons (cons lower upper) ranges)))
  688. ((< upper *simple-cset-boundary*)
  689. (fill-simple-cset! lower upper)
  690. (loop (+ 2 index) ranges))
  691. (else
  692. (fill-simple-cset! lower *simple-cset-boundary*)
  693. (loop (+ 2 index)
  694. (cons (cons *simple-cset-boundary* upper) ranges)))))))))