unicode-data.scm 45 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Mike Sperber
  3. ; Copyright (c) 2005-2006 by Basis Technology Corporation.
  4. ; Parse UnicodeData.txt and various other files from the Unicode
  5. ; consortium, and generate character classification and conversion
  6. ; tables from it.
  7. (define (string-split string at)
  8. (let ((count (string-length string)))
  9. (let loop ((index 0)
  10. (rev-result '()))
  11. (cond
  12. ((>= index count)
  13. (reverse (cons "" rev-result)))
  14. ((string-index string at index)
  15. => (lambda (found)
  16. (loop (+ 1 found)
  17. (cons (substring string index found)
  18. rev-result))))
  19. (else
  20. (reverse (cons (substring string index count)
  21. rev-result)))))))
  22. (define (split-unicode-data-record line)
  23. (string-split line #\;))
  24. (define (maybe-code-point text default)
  25. (if (zero? (string-length text))
  26. default
  27. (string->number text 16)))
  28. (define-record-type code-point-info :code-point-info
  29. (make-code-point-info code-point
  30. name
  31. general-category
  32. combining-class
  33. bidirectional-category-id
  34. canonical-decomposition
  35. compatibility-decomposition
  36. decimal-digit-value
  37. digit-value
  38. numeric-value
  39. mirrored?
  40. unicode-1.0-name
  41. iso-10646-comment
  42. uppercase-code-point
  43. lowercase-code-point
  44. titlecase-code-point)
  45. code-point-info?
  46. ;; number
  47. (code-point code-point-info-code-point)
  48. ;; string
  49. (name code-point-info-name)
  50. ;; :GENERAL-CATEGORY
  51. (general-category code-point-info-general-category)
  52. ;; number
  53. (combining-class code-point-info-combining-class)
  54. ;; symbol
  55. (bidirectional-category-id code-point-info-bidirectional-category-id)
  56. ;; #f or list
  57. (canonical-decomposition code-point-info-canonical-decomposition)
  58. (compatibility-decomposition code-point-info-compatibility-decomposition)
  59. ;; number
  60. (decimal-digit-value code-point-info-decimal-digit-value)
  61. ;; number
  62. (digit-value code-point-info-digit-value)
  63. ;; number
  64. (numeric-value code-point-info-numeric-value)
  65. ;; boolean
  66. (mirrored? code-point-info-mirrored?)
  67. ;; string
  68. (unicode-1.0-name code-point-info-unicode-1.0-name)
  69. ;; string
  70. (iso-10646-comment code-point-info-iso-10646-comment)
  71. ;; number
  72. (uppercase-code-point code-point-info-uppercase-code-point)
  73. ;; number
  74. (lowercase-code-point code-point-info-lowercase-code-point)
  75. ;; number
  76. (titlecase-code-point code-point-info-titlecase-code-point))
  77. (define-record-discloser :code-point-info
  78. (lambda (r)
  79. (list 'code-point-info
  80. (code-point-info-code-point r)
  81. (code-point-info-name r)
  82. (code-point-info-general-category r)
  83. (code-point-info-combining-class r)
  84. (code-point-info-bidirectional-category-id r)
  85. (code-point-info-canonical-decomposition r)
  86. (code-point-info-compatibility-decomposition r)
  87. (code-point-info-decimal-digit-value r)
  88. (code-point-info-digit-value r)
  89. (code-point-info-numeric-value r)
  90. (code-point-info-mirrored? r)
  91. (code-point-info-unicode-1.0-name r)
  92. (code-point-info-iso-10646-comment r)
  93. (code-point-info-uppercase-code-point r)
  94. (code-point-info-lowercase-code-point r)
  95. (code-point-info-titlecase-code-point r))))
  96. (define (unicode-data-record->info line)
  97. (destructure (((code-point-hex
  98. name
  99. general-category-id
  100. combining-class-id
  101. bidirectional-category-text
  102. decomposition-text
  103. decimal-digit-value-text
  104. digit-value-text
  105. numeric-value-text
  106. mirrored-y/n
  107. unicode-1.0-name
  108. iso-10646-comment
  109. uppercase-code-point-hex
  110. lowercase-code-point-hex
  111. titlecase-code-point-hex)
  112. (split-unicode-data-record line)))
  113. (let ((code-point (maybe-code-point code-point-hex #f)))
  114. (let ((uppercase-code-point (maybe-code-point uppercase-code-point-hex code-point))
  115. (lowercase-code-point (maybe-code-point lowercase-code-point-hex code-point))
  116. (titlecase-code-point (maybe-code-point titlecase-code-point-hex code-point))
  117. (decomposition (parse-decomposition decomposition-text)))
  118. (make-code-point-info code-point
  119. name
  120. (id->general-category general-category-id)
  121. (string->number combining-class-id)
  122. (string->symbol bidirectional-category-text)
  123. (and (and (pair? decomposition) (number? (car decomposition)))
  124. decomposition)
  125. (and (and (pair? decomposition) (symbol? (car decomposition)))
  126. (cdr decomposition))
  127. (string->number decimal-digit-value-text)
  128. (string->number digit-value-text)
  129. (string->number numeric-value-text)
  130. (string=? mirrored-y/n "Y")
  131. unicode-1.0-name
  132. iso-10646-comment
  133. uppercase-code-point
  134. lowercase-code-point
  135. titlecase-code-point)))))
  136. ;; return #f or a list, which contains the scalar values of the decompositon
  137. ;; for compatibility decompositions, the tag is prepended as a symbol
  138. (define (parse-decomposition d)
  139. (cond
  140. ((zero? (string-length d))
  141. #f)
  142. ((char=? #\< (string-ref d 0))
  143. (let ((after (string-index d #\space)))
  144. (cons (string->symbol (substring d 0 after))
  145. (call-with-values
  146. (lambda ()
  147. (parse-scalar-values d after))
  148. (lambda (l i) l)))))
  149. (else
  150. (call-with-values
  151. (lambda ()
  152. (parse-scalar-values d 0))
  153. (lambda (l i) l)))))
  154. ; for EXPANDED-CODE-POINT-INFO-SOURCE
  155. (define (code-point-info-with-code-point+name info code-point name)
  156. (make-code-point-info code-point
  157. name
  158. (code-point-info-general-category info)
  159. (code-point-info-combining-class info)
  160. (code-point-info-bidirectional-category-id info)
  161. (code-point-info-canonical-decomposition info)
  162. (code-point-info-compatibility-decomposition info)
  163. (code-point-info-decimal-digit-value info)
  164. (code-point-info-digit-value info)
  165. (code-point-info-numeric-value info)
  166. (code-point-info-mirrored? info)
  167. (code-point-info-unicode-1.0-name info)
  168. (code-point-info-iso-10646-comment info)
  169. code-point code-point code-point)) ; kludge
  170. ; expand the code-point ranges that look like this:
  171. ; 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  172. ; 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  173. ; returns a thunk that returns the infos from consecutive calls,
  174. ; then #f
  175. (define (expanded-code-point-info-source infos)
  176. (let ((first-info #f)
  177. (code-point #f)
  178. (last-code-point #f)
  179. (name-base #f))
  180. (lambda ()
  181. (let again ()
  182. (cond
  183. (first-info
  184. (if (<= code-point last-code-point)
  185. (begin
  186. (set! code-point (+ 1 code-point))
  187. (code-point-info-with-code-point+name
  188. first-info
  189. (- code-point 1)
  190. name-base)) ; kludge for speed; should be:
  191. ; (string-append name-base (number->string code-point 16))
  192. (begin
  193. (set! first-info #f)
  194. (again))))
  195. ((null? infos)
  196. #f)
  197. (else
  198. (let* ((info (car infos))
  199. (name (code-point-info-name info)))
  200. (cond
  201. ((and (string-prefix? "<" name)
  202. (string-suffix? ", First>" name))
  203. (set! first-info info)
  204. (set! code-point (code-point-info-code-point info))
  205. (set! last-code-point (code-point-info-code-point (cadr infos)))
  206. (set! name-base (string-append
  207. (substring name
  208. 1 ; (string-length "<")
  209. (- (string-length name)
  210. 8 ; (string-length ", First>")
  211. ))
  212. "-<code point>")) ; kludge, see above
  213. (set! infos (cddr infos))
  214. (again))
  215. (else
  216. (set! infos (cdr infos))
  217. info)))))))))
  218. (define (for-each-expanded-code-point-info proc infos)
  219. (let ((source (expanded-code-point-info-source infos)))
  220. (let loop ()
  221. (let ((info (source)))
  222. (if info
  223. (begin
  224. (proc info)
  225. (loop)))))))
  226. (define (read-line port)
  227. (let loop ((l '()))
  228. (let ((c (read-char port)))
  229. (if (eof-object? c)
  230. c
  231. (if (char=? c #\newline)
  232. (list->string (reverse l))
  233. (loop (cons c l)))))))
  234. (define (parse-unicode-data filename)
  235. (call-with-input-file filename
  236. (lambda (port)
  237. (let loop ((rev-infos '()))
  238. (let ((thing (read-line port)))
  239. (if (eof-object? thing)
  240. (reverse rev-infos)
  241. (loop (cons (unicode-data-record->info thing) rev-infos))))))))
  242. ; Mapping the relevant info (general category + case mappings) into a
  243. ; compact array
  244. (define (mapping-offsets infos accessor)
  245. (let loop ((infos infos)
  246. (offsets '()))
  247. (if (null? infos)
  248. (list->vector offsets)
  249. (let* ((info (car infos))
  250. (code-point (code-point-info-code-point info))
  251. (other (accessor info))
  252. (offset (- other code-point)))
  253. (if (member offset offsets)
  254. (loop (cdr infos) offsets)
  255. (loop (cdr infos) (cons offset offsets)))))))
  256. (define (vector-index vector value)
  257. (let ((count (vector-length vector)))
  258. (let loop ((i 0))
  259. (cond
  260. ((>= i count) #f)
  261. ((equal? value (vector-ref vector i)) i)
  262. (else (loop (+ 1 i)))))))
  263. (define (code-point-info->case+general-category-encoding
  264. info
  265. specialcasing?
  266. special-lowercase-table special-uppercase-table
  267. uppercase-offsets lowercase-offsets titlecase-offsets
  268. uppercase-index-width lowercase-index-width titlecase-index-width)
  269. (let ((code-point (code-point-info-code-point info)))
  270. (let ((uppercase-index (vector-index uppercase-offsets
  271. (- (code-point-info-uppercase-code-point info)
  272. code-point)))
  273. (lowercase-index (vector-index lowercase-offsets
  274. (- (code-point-info-lowercase-code-point info)
  275. code-point)))
  276. (titlecase-index (vector-index titlecase-offsets
  277. (- (code-point-info-titlecase-code-point info)
  278. code-point)))
  279. (uppercase? (or (eq? (general-category uppercase-letter)
  280. (code-point-info-general-category info))
  281. (table-ref special-uppercase-table code-point)))
  282. (lowercase? (or (eq? (general-category lowercase-letter)
  283. (code-point-info-general-category info))
  284. (table-ref special-lowercase-table code-point))))
  285. (bitwise-ior
  286. (arithmetic-shift
  287. (bitwise-ior
  288. (arithmetic-shift (bitwise-ior
  289. (arithmetic-shift
  290. (bitwise-ior
  291. (arithmetic-shift
  292. (bitwise-ior (if specialcasing? 4 0)
  293. (if uppercase? 2 0)
  294. (if lowercase? 1 0))
  295. uppercase-index-width)
  296. uppercase-index)
  297. lowercase-index-width)
  298. lowercase-index)
  299. titlecase-index-width)
  300. titlecase-index)
  301. *general-category-bits*)
  302. (general-category-index (code-point-info-general-category info))))))
  303. (define (code-point-encoding-uppercase? encoding
  304. uppercase-index-width lowercase-index-width titlecase-index-width)
  305. (not
  306. (zero?
  307. (bitwise-and 1
  308. (arithmetic-shift encoding
  309. (- (+ 1
  310. uppercase-index-width
  311. lowercase-index-width
  312. titlecase-index-width
  313. *general-category-bits*)))))))
  314. (define (code-point-encoding-lowercase? encoding
  315. uppercase-index-width lowercase-index-width titlecase-index-width)
  316. (not
  317. (zero?
  318. (bitwise-and 1
  319. (arithmetic-shift encoding
  320. (- (+ uppercase-index-width
  321. lowercase-index-width
  322. titlecase-index-width
  323. *general-category-bits*)))))))
  324. (define (lookup-by-offset-index code-point offset-index offsets)
  325. (+ code-point (vector-ref offsets offset-index)))
  326. (define (code-point-encoding-uppercase-code-point code-point encoding
  327. uppercase-offsets
  328. uppercase-index-width lowercase-index-width titlecase-index-width)
  329. (lookup-by-offset-index
  330. code-point
  331. (bitwise-and (- (arithmetic-shift 1 uppercase-index-width) 1)
  332. (arithmetic-shift encoding
  333. (- (+ lowercase-index-width titlecase-index-width *general-category-bits*))))
  334. uppercase-offsets))
  335. (define (code-point-encoding-lowercase-code-point code-point encoding
  336. lowercase-offsets
  337. uppercase-index-width lowercase-index-width titlecase-index-width)
  338. (lookup-by-offset-index
  339. code-point
  340. (bitwise-and (- (arithmetic-shift 1 lowercase-index-width) 1)
  341. (arithmetic-shift encoding
  342. (- (+ titlecase-index-width *general-category-bits*))))
  343. lowercase-offsets))
  344. (define (code-point-encoding-titlecase-code-point code-point encoding
  345. titlecase-offsets
  346. uppercase-index-width lowercase-index-width titlecase-index-width)
  347. (lookup-by-offset-index
  348. code-point
  349. (bitwise-and (- (arithmetic-shift 1 titlecase-index-width) 1)
  350. (arithmetic-shift encoding (- *general-category-bits*)))
  351. titlecase-offsets))
  352. (define *code-point-encoding-general-category-mask*
  353. (- (arithmetic-shift 1 *general-category-bits*) 1))
  354. (define (code-point-encoding-general-category encoding)
  355. (vector-ref general-categories
  356. (bitwise-and encoding *code-point-encoding-general-category-mask*)))
  357. (define (max-code-point infos)
  358. (let loop ((max 0) (infos infos))
  359. (cond
  360. ((null? infos) max)
  361. ((> (code-point-info-code-point (car infos))
  362. max)
  363. (loop (code-point-info-code-point (car infos)) (cdr infos)))
  364. (else (loop max (cdr infos))))))
  365. ; returns a THUNK that will return for each code-point in sequence
  366. ; (PROC <code-point>) or DEFAULT if there's no info.
  367. ; assumes INFOS are sorted
  368. (define (make-consecutive-info-source source make-default proc)
  369. (let ((next-info #f)
  370. (last-code-point -1))
  371. (lambda ()
  372. (define (upto info)
  373. (if (< last-code-point (code-point-info-code-point info))
  374. (begin
  375. (set! next-info info)
  376. (proc (make-default last-code-point)))
  377. (begin
  378. (set! next-info #f)
  379. ;; scalar values only
  380. (if (eq? (code-point-info-general-category info)
  381. (general-category surrogate))
  382. (proc (make-default last-code-point))
  383. (proc info)))))
  384. (set! last-code-point (+ 1 last-code-point))
  385. (cond
  386. ((or next-info (source)) => upto)
  387. (else #f)))))
  388. ; Dealing with PropList.txt
  389. (define (parse-proplist-for-upper/lowercase filename)
  390. (call-with-input-file filename
  391. (lambda (port)
  392. (let ((uppercase (make-integer-table)) (lowercase (make-integer-table)))
  393. (let loop ()
  394. (let ((thing (read-line port)))
  395. (if (eof-object? thing)
  396. (values uppercase lowercase)
  397. (call-with-values
  398. (lambda ()
  399. (extract-upper/lowercase thing))
  400. (lambda (uppers lowers)
  401. (for-each (lambda (u)
  402. (table-set! uppercase u #t))
  403. uppers)
  404. (for-each (lambda (l)
  405. (table-set! lowercase l #t))
  406. lowers)
  407. (loop))))))))))
  408. (define (extract-upper/lowercase line)
  409. (cond
  410. ((string-prefix? "#" line)
  411. (values '() '()))
  412. ((string-contains line "Other_Uppercase")
  413. (values (proplist-line-range line)
  414. '()))
  415. ((string-contains line "Other_Lowercase")
  416. (values '()
  417. (proplist-line-range line)))
  418. (else
  419. (values '() '()))))
  420. (define (proplist-line-range line)
  421. (let* ((i1 (string-skip line char-set:hex-digit))
  422. (first (string->number (substring line 0 i1) 16)))
  423. (if (char=? #\. (string-ref line i1))
  424. (let* ((i2 (string-skip line #\. i1))
  425. (i3 (string-skip line char-set:hex-digit i2))
  426. (last (string->number (substring line i2 i3) 16)))
  427. (let loop ((last last) (range '()))
  428. (if (= last first)
  429. (cons last range)
  430. (loop (- last 1) (cons last range)))))
  431. (list first))))
  432. ; assumes START points to whitespace or the first digit
  433. ; returns list of scalar values + position after sequence
  434. ; (possibly after trailing semicolon)
  435. (define (parse-scalar-values s start)
  436. (let ((size (string-length s)))
  437. (let loop ((start start) (rev-values '()))
  438. (let ((i1 (string-skip s char-set:whitespace start)))
  439. (cond
  440. ((not i1)
  441. (values (reverse rev-values) (+ start 1)))
  442. ((char=? #\; (string-ref s i1))
  443. (values (reverse rev-values) (+ i1 1)))
  444. (else
  445. (let* ((i2 (or (string-skip s char-set:hex-digit i1)
  446. size))
  447. (n (string->number (substring s i1 i2) 16)))
  448. (loop i2 (cons n rev-values)))))))))
  449. (define-record-type specialcasing :specialcasing
  450. (make-specialcasing scalar-value
  451. lowercase titlecase uppercase foldcase
  452. final-sigma?)
  453. specialcasing?
  454. (scalar-value specialcasing-scalar-value)
  455. (lowercase specialcasing-lowercase)
  456. (titlecase specialcasing-titlecase)
  457. (uppercase specialcasing-uppercase)
  458. ;; This will actually come from CaseFolding.txt
  459. (foldcase specialcasing-foldcase set-specialcasing-foldcase!)
  460. (final-sigma? specialcasing-final-sigma?))
  461. (define (parse-specialcasing-line line)
  462. (let* ((i1 (string-skip line char-set:hex-digit 0))
  463. (n (string->number (substring line 0 i1) 16)))
  464. (call-with-values
  465. (lambda () (parse-scalar-values line (+ 1 i1)))
  466. (lambda (lowercase i2)
  467. (call-with-values
  468. (lambda () (parse-scalar-values line i2))
  469. (lambda (titlecase i3)
  470. (call-with-values
  471. (lambda () (parse-scalar-values line i3))
  472. (lambda (uppercase i4)
  473. (let ((i5 (or (string-index line #\; (+ 1 i4))
  474. (string-index line #\# (+ 1 i4))
  475. (string-length line))))
  476. (let ((conditions (string-trim-both (substring line i4 i5))))
  477. (if (or (string=? "" conditions)
  478. (string=? "Final_Sigma" conditions))
  479. (make-specialcasing n
  480. lowercase titlecase uppercase #f
  481. (string=? conditions "Final_Sigma"))
  482. #f)))))))))))
  483. (define (parse-specialcasing filename)
  484. (call-with-input-file filename
  485. (lambda (port)
  486. (let loop ((specialcasings '()))
  487. (let ((thing (read-line port)))
  488. (if (eof-object? thing)
  489. specialcasings
  490. (cond
  491. ((and (not (string=? "" thing))
  492. (not (char=? #\# (string-ref thing 0)))
  493. (parse-specialcasing-line thing))
  494. => (lambda (sc)
  495. (loop (cons sc specialcasings))))
  496. (else (loop specialcasings)))))))))
  497. ; we only extract the common and full case foldings
  498. (define (parse-casefolding-line line)
  499. (let* ((i1 (string-skip line char-set:hex-digit 0))
  500. (n (string->number (substring line 0 i1) 16))
  501. (i2 (string-skip line char-set:whitespace (+ 1 i1)))
  502. (status (string-ref line i2)))
  503. (call-with-values
  504. (lambda ()
  505. (parse-scalar-values line (+ 2 i2)))
  506. (lambda (scalar-values i)
  507. (cond
  508. ((or (char=? status #\C)
  509. (char=? status #\F))
  510. (cons n (cons status scalar-values)))
  511. ((> (length scalar-values) 1)
  512. (error "multi-character common case-folding mapping"))
  513. (else #f))))))
  514. (define (parse-casefolding filename)
  515. (call-with-input-file filename
  516. (lambda (port)
  517. (let loop ((casefoldings '()))
  518. (let ((thing (read-line port)))
  519. (cond
  520. ((eof-object? thing) casefoldings)
  521. ((and (not (string=? "" thing))
  522. (not (char=? #\# (string-ref thing 0)))
  523. (parse-casefolding-line thing))
  524. => (lambda (folding)
  525. (loop (cons folding casefoldings))))
  526. (else (loop casefoldings))))))))
  527. (define (merge-specialcasings+casefoldings! specialcasings casefoldings)
  528. (for-each
  529. (lambda (casefolding)
  530. (let ((sv (car casefolding))
  531. (status (cadr casefolding))
  532. (folding (cddr casefolding)))
  533. (cond
  534. ((find (lambda (specialcasing)
  535. (= (specialcasing-scalar-value specialcasing) sv))
  536. specialcasings)
  537. => (lambda (specialcasing)
  538. (set-specialcasing-foldcase! specialcasing folding)))
  539. ((char=? status #\F) ; the others will be covered by UnicodeData.txt
  540. (let ((sv-list (list sv)))
  541. (set! specialcasings
  542. (cons
  543. (make-specialcasing sv
  544. sv-list sv-list sv-list
  545. folding
  546. #f)
  547. specialcasings)))))))
  548. casefoldings)
  549. specialcasings)
  550. (define (parse-specialcasing+casefolding specialcasing-filename casefolding-filename)
  551. (let ((specialcasings (parse-specialcasing specialcasing-filename))
  552. (casefoldings (parse-casefolding casefolding-filename)))
  553. (merge-specialcasings+casefoldings! specialcasings casefoldings)))
  554. (define (list-prefix? l1 l2)
  555. (let loop ((l1 l1) (l2 l2))
  556. (cond
  557. ((null? l1) #t)
  558. ((null? l2) #f)
  559. ((equal? (car l1) (car l2))
  560. (loop (cdr l1) (cdr l2)))
  561. (else #f))))
  562. ; We return two lists: a list of :SPECIALCASING records where the
  563. ; xxxCASE fields are replaced by (offset . length) pairs into the
  564. ; second list, which contains all the case mappings jumbled together.
  565. (define (specialcasing-encoding specialcasings)
  566. (let ((casings '()))
  567. (define (add-casing! l)
  568. (let loop ((rest casings)
  569. (index 0))
  570. (cond
  571. ((null? rest)
  572. (set! casings (append casings l))
  573. index)
  574. ((list-prefix? l rest)
  575. index)
  576. (else
  577. (loop (cdr rest) (+ 1 index))))))
  578. (define (transform-specialcasing s)
  579. (let ((lowercase (cons (add-casing! (specialcasing-lowercase s))
  580. (length (specialcasing-lowercase s))))
  581. (titlecase (cons (add-casing! (specialcasing-titlecase s))
  582. (length (specialcasing-titlecase s))))
  583. (uppercase (cons (add-casing! (specialcasing-uppercase s))
  584. (length (specialcasing-uppercase s))))
  585. (foldcase (cons (add-casing! (specialcasing-foldcase s))
  586. (length (specialcasing-foldcase s)))))
  587. (make-specialcasing (specialcasing-scalar-value s)
  588. lowercase titlecase uppercase foldcase
  589. (specialcasing-final-sigma? s))))
  590. (let ((transformed
  591. (map transform-specialcasing specialcasings)))
  592. (values transformed
  593. casings))))
  594. (define (specialcasing-encoding-ref casings offset size)
  595. (let loop ((i 0) (r '()))
  596. (if (>= i size)
  597. (reverse r)
  598. (loop (+ 1 i)
  599. (cons (vector-ref casings (+ offset i))
  600. r)))))
  601. ; for testing
  602. (define (check-specialcasing-encodings specialcasings)
  603. (call-with-values
  604. (lambda () (specialcasing-encoding specialcasings))
  605. (lambda (encodings casings)
  606. (let ((casings (list->vector casings)))
  607. (for-each
  608. (lambda (specialcasing encoding)
  609. (define (check select)
  610. (let ((pair (select encoding))
  611. (reference (select specialcasing)))
  612. (if (not
  613. (equal? reference
  614. (specialcasing-encoding-ref casings
  615. (car pair) (cdr pair))))
  616. (error "encoding failure" encoding
  617. reference (specialcasing-encoding-ref casings
  618. (car pair) (cdr pair))))))
  619. (check specialcasing-lowercase)
  620. (check specialcasing-uppercase)
  621. (check specialcasing-titlecase)
  622. (check specialcasing-foldcase))
  623. specialcasings encodings)))))
  624. (define (specialcasings->table specialcasings)
  625. (let ((table (make-integer-table)))
  626. (for-each (lambda (s)
  627. (table-set! table (specialcasing-scalar-value s)
  628. s))
  629. specialcasings)
  630. table))
  631. (define (make-scalar-value-case+general-category-encoding-tables
  632. infos
  633. special-lowercase-table special-uppercase-table
  634. specialcasings)
  635. (let ((uppercase-offsets (mapping-offsets infos code-point-info-uppercase-code-point))
  636. (lowercase-offsets (mapping-offsets infos code-point-info-lowercase-code-point))
  637. (titlecase-offsets (mapping-offsets infos code-point-info-titlecase-code-point)))
  638. (let ((uppercase-index-width (bits-necessary (vector-length uppercase-offsets)))
  639. (lowercase-index-width (bits-necessary (vector-length lowercase-offsets)))
  640. (titlecase-index-width (bits-necessary (vector-length titlecase-offsets)))
  641. (specialcasings-table (specialcasings->table specialcasings))
  642. (block-size (expt 2 *block-bits*)))
  643. (call-with-values
  644. (lambda ()
  645. (compute-compact-table
  646. (make-consecutive-info-source
  647. (expanded-code-point-info-source infos)
  648. (lambda (code-point)
  649. (make-code-point-info code-point
  650. "<unassigned>"
  651. (general-category unassigned)
  652. #f #f #f #f #f #f #f #f #f #f
  653. code-point code-point code-point))
  654. (lambda (info)
  655. (code-point-info->case+general-category-encoding
  656. info
  657. (table-ref specialcasings-table
  658. (code-point-info-code-point info))
  659. special-lowercase-table special-uppercase-table
  660. uppercase-offsets lowercase-offsets titlecase-offsets
  661. uppercase-index-width lowercase-index-width titlecase-index-width)))
  662. block-size))
  663. (lambda (indices encodings)
  664. (values indices encodings
  665. uppercase-offsets lowercase-offsets titlecase-offsets))))))
  666. ; saves a couple of kilobyes, but probably not worthwhile
  667. (define (write-vector-code/rll name vector port)
  668. (write `(define ,name (make-vector ,(vector-length vector)))
  669. port)
  670. (newline port)
  671. (let loop ((values (vector->list vector))
  672. (index 0))
  673. (cond
  674. ((null? values))
  675. ((or (null? (cdr values))
  676. (not (equal? (car values) (cadr values))))
  677. (write `(vector-set! ,name ,index ,(car values))
  678. port)
  679. (newline port)
  680. (loop (cdr values) (+ 1 index)))
  681. (else
  682. (let ((value (car values)))
  683. (let inner-loop ((values values)
  684. (last-index index))
  685. (cond
  686. ((or (null? values)
  687. (not (equal? (car values) value)))
  688. (write
  689. `(do ((i ,index (+ 1 i)))
  690. ((>= i ,last-index))
  691. (vector-set! ,name i ,value))
  692. port)
  693. (newline port)
  694. (loop values last-index))
  695. (else
  696. (inner-loop (cdr values) (+ 1 last-index))))))))))
  697. (define (create-unicode-tables unicode-data-filename
  698. proplist-filename
  699. specialcasing-filename
  700. casefolding-filename
  701. composition-exclusions-filename
  702. category-output-file
  703. syntax-info-output-file
  704. normalization-output-file
  705. srfi-14-base-output-file)
  706. (let ((infos (parse-unicode-data unicode-data-filename))
  707. (specialcasings (parse-specialcasing+casefolding specialcasing-filename
  708. casefolding-filename)))
  709. (call-with-values
  710. (lambda ()
  711. (parse-proplist-for-upper/lowercase proplist-filename))
  712. (lambda (special-uppercase-table special-lowercase-table)
  713. (call-with-output-file category-output-file
  714. (lambda (port)
  715. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  716. port)
  717. (newline port)
  718. (newline port)
  719. (write-unicode-category-tables infos
  720. special-uppercase-table special-lowercase-table
  721. specialcasings
  722. port)
  723. (write-specialcasings-tables specialcasings port)))
  724. (call-with-output-file syntax-info-output-file
  725. (lambda (port)
  726. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  727. port)
  728. (newline port)
  729. (newline port)
  730. (write-syntax-info infos port)
  731. (newline port)))
  732. (write-srfi-14-base-char-sets infos srfi-14-base-output-file)
  733. (call-with-output-file normalization-output-file
  734. (lambda (port)
  735. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  736. port)
  737. (newline port)
  738. (newline port)
  739. (write-normalization-tables
  740. infos
  741. (parse-composition-exclusions composition-exclusions-filename)
  742. port)))))))
  743. (define *block-bits* 8) ; better than 9, at least
  744. (define (write-unicode-category-tables infos
  745. special-uppercase-table special-lowercase-table
  746. specialcasings
  747. port)
  748. (call-with-values
  749. (lambda ()
  750. (make-scalar-value-case+general-category-encoding-tables
  751. infos
  752. special-lowercase-table special-uppercase-table
  753. specialcasings))
  754. (lambda (indices
  755. encodings
  756. uppercase-offsets lowercase-offsets titlecase-offsets)
  757. (write `(define *encoding-table-block-bits* ,*block-bits*)
  758. port)
  759. (newline port)
  760. (newline port)
  761. (write `(define *uppercase-index-width*
  762. ,(bits-necessary (vector-length uppercase-offsets)))
  763. port)
  764. (newline port)
  765. (write `(define *lowercase-index-width*
  766. ,(bits-necessary (vector-length lowercase-offsets)))
  767. port)
  768. (newline port)
  769. (write `(define *titlecase-index-width*
  770. ,(bits-necessary (vector-length titlecase-offsets)))
  771. port)
  772. (newline port)
  773. (newline port)
  774. (write `(define *scalar-value-info-indices* ',indices)
  775. port)
  776. (newline port)
  777. (write `(define *scalar-value-info-encodings* ',encodings)
  778. port)
  779. (newline port)
  780. (newline port)
  781. (write `(define *uppercase-offsets* ',uppercase-offsets)
  782. port)
  783. (newline port)
  784. (write `(define *lowercase-offsets* ',lowercase-offsets)
  785. port)
  786. (newline port)
  787. (write `(define *titlecase-offsets* ',titlecase-offsets)
  788. port)
  789. (newline port)
  790. (newline port))))
  791. (define (write-specialcasings-tables specialcasings port)
  792. (call-with-values
  793. (lambda () (specialcasing-encoding specialcasings))
  794. (lambda (encodings casings)
  795. ;; we write it out here to avoid introducing yet another file
  796. ;; into the UNICODE-CHAR-MAPS package
  797. (write
  798. '(define-record-type specialcasing :specialcasing
  799. (make-specialcasing scalar-value
  800. lowercase-start lowercase-length
  801. titlecase-start titlecase-length
  802. uppercase-start uppercase-length
  803. foldcase-start foldcase-length
  804. final-sigma?)
  805. specialcasing?
  806. (scalar-value specialcasing-scalar-value)
  807. (lowercase-start specialcasing-lowercase-start)
  808. (lowercase-length specialcasing-lowercase-length)
  809. (titlecase-start specialcasing-titlecase-start)
  810. (titlecase-length specialcasing-titlecase-length)
  811. (uppercase-start specialcasing-uppercase-start)
  812. (uppercase-length specialcasing-uppercase-length)
  813. (foldcase-start specialcasing-foldcase-start)
  814. (foldcase-length specialcasing-foldcase-length)
  815. (final-sigma? specialcasing-final-sigma?))
  816. port)
  817. (newline port)
  818. (newline port)
  819. (write `(define *specialcasing-table* (make-integer-table)) port)
  820. (newline port)
  821. (newline port)
  822. (for-each
  823. (lambda (c)
  824. (write
  825. `(table-set! *specialcasing-table*
  826. ,(specialcasing-scalar-value c)
  827. (make-specialcasing
  828. ,(specialcasing-scalar-value c)
  829. ,(car (specialcasing-lowercase c))
  830. ,(cdr (specialcasing-lowercase c))
  831. ,(car (specialcasing-titlecase c))
  832. ,(cdr (specialcasing-titlecase c))
  833. ,(car (specialcasing-uppercase c))
  834. ,(cdr (specialcasing-uppercase c))
  835. ,(car (specialcasing-foldcase c))
  836. ,(cdr (specialcasing-foldcase c))
  837. ,(specialcasing-final-sigma? c)))
  838. port)
  839. (newline port))
  840. encodings)
  841. (newline port)
  842. (write `(define *specialcasings* (list->string (map scalar-value->char ',casings))) port)
  843. (newline port)
  844. (newline port))))
  845. ;; Read syntax
  846. (define (write-syntax-info infos port)
  847. (write `(define *non-symbol-constituents-above-127*
  848. ',(list->vector (non-symbol-constituents-above-127 infos)))
  849. port)
  850. (newline port)
  851. (newline port)
  852. (write `(define *whitespaces*
  853. ',(list->vector (whitespaces infos)))
  854. port)
  855. (newline port))
  856. (define *symbol-constituent-general-categories*
  857. (list (general-category uppercase-letter)
  858. (general-category lowercase-letter)
  859. (general-category titlecase-letter)
  860. (general-category modified-letter)
  861. (general-category other-letter)
  862. (general-category non-spacing-mark)
  863. (general-category combining-spacing-mark)
  864. (general-category enclosing-mark)
  865. (general-category decimal-digit-number)
  866. (general-category letter-number)
  867. (general-category other-number)
  868. (general-category dash-punctuation)
  869. (general-category connector-punctuation)
  870. (general-category other-punctuation)
  871. (general-category currency-symbol)
  872. (general-category mathematical-symbol)
  873. (general-category modifier-symbol)
  874. (general-category other-symbol)
  875. (general-category private-use-character)))
  876. (define (symbol-constituent-above-127? info)
  877. (memq (code-point-info-general-category info)
  878. *symbol-constituent-general-categories*))
  879. (define (non-symbol-constituents-above-127 infos)
  880. (let ((reverse-non-constituents '()))
  881. (for-each-expanded-code-point-info
  882. (lambda (info)
  883. (let ((cp (code-point-info-code-point info)))
  884. (if (and (> cp 127)
  885. (not (eq? (general-category surrogate)
  886. (code-point-info-general-category info)))
  887. (not (symbol-constituent-above-127? info)))
  888. (set! reverse-non-constituents
  889. (cons cp reverse-non-constituents)))))
  890. infos)
  891. (reverse reverse-non-constituents)))
  892. (define (whitespaces infos)
  893. (let ((reverse-whitespaces '()))
  894. (for-each-expanded-code-point-info
  895. (lambda (info)
  896. (if (eq? (general-category-primary-category
  897. (code-point-info-general-category info))
  898. (primary-category separator))
  899. (set! reverse-whitespaces
  900. (cons (code-point-info-code-point info)
  901. reverse-whitespaces))))
  902. infos)
  903. (sort-list (append '(#x009 #x00a #x00b #x00c #x00d #x085)
  904. reverse-whitespaces)
  905. <)))
  906. (define (write-srfi-14-base-char-sets infos output-file)
  907. (call-with-output-file output-file
  908. (lambda (port)
  909. (display "; Automatically generated by WRITE-SRFI-14-BASE-CHAR-SETS; do not edit."
  910. port)
  911. (newline port)
  912. (newline port)
  913. (let-syntax
  914. ((general-category-predicate
  915. (syntax-rules ()
  916. ((general-category-predicate ?name)
  917. (lambda (info)
  918. (eq? (code-point-info-general-category info)
  919. (general-category ?name))))))
  920. (primary-category-predicate
  921. (syntax-rules ()
  922. ((primary-category-predicate ?name)
  923. (lambda (info)
  924. (eq? (general-category-primary-category
  925. (code-point-info-general-category info))
  926. (primary-category ?name)))))))
  927. (write-srfi-14-base-char-set-definition
  928. 'char-set:lower-case
  929. srfi-14-lower-case?
  930. infos port)
  931. (write-srfi-14-base-char-set-definition
  932. 'char-set:upper-case
  933. srfi-14-upper-case?
  934. infos port)
  935. (write-srfi-14-base-char-set-definition
  936. 'char-set:title-case
  937. (general-category-predicate titlecase-letter)
  938. infos port)
  939. (write-srfi-14-base-char-set-definition
  940. 'char-set:letter
  941. (primary-category-predicate letter)
  942. infos port)
  943. (write-srfi-14-base-char-set-definition
  944. 'char-set:digit
  945. (general-category-predicate decimal-digit-number)
  946. infos port)
  947. (write-srfi-14-base-char-set-definition
  948. 'char-set:mark
  949. (primary-category-predicate mark)
  950. infos port)
  951. (write-srfi-14-base-char-set-definition
  952. 'char-set:separator
  953. (primary-category-predicate separator)
  954. infos port)
  955. (write-srfi-14-base-char-set-definition
  956. 'char-set:punctuation
  957. (primary-category-predicate punctuation)
  958. infos port)
  959. (write-srfi-14-base-char-set-definition
  960. 'char-set:symbol
  961. (primary-category-predicate symbol)
  962. infos port)
  963. (write-srfi-14-base-char-set-definition
  964. 'char-set:space-separator
  965. (general-category-predicate space-separator)
  966. infos port)))))
  967. ; SRFI 14 has funny notions of lower case and upper case
  968. (define (srfi-14-lower-case? info)
  969. (let ((cp (code-point-info-code-point info)))
  970. (and (not (and (>= cp #x2000)
  971. (<= cp #x2fff)))
  972. (= cp (code-point-info-lowercase-code-point info))
  973. (or (not (= cp (code-point-info-uppercase-code-point info)))
  974. (string-contains (code-point-info-name info)
  975. "SMALL LETTER")
  976. (string-contains (code-point-info-name info)
  977. "SMALL LIGATURE")))))
  978. (define (srfi-14-upper-case? info)
  979. (let ((cp (code-point-info-code-point info)))
  980. (and (not (and (>= cp #x2000)
  981. (<= cp #x2fff)))
  982. (= cp (code-point-info-uppercase-code-point info))
  983. (or (not (= cp (code-point-info-lowercase-code-point info)))
  984. (string-contains (code-point-info-name info)
  985. "CAPITAL LETTER")
  986. (string-contains (code-point-info-name info)
  987. "CAPITAL LIGATURE")))))
  988. (define (write-srfi-14-base-char-set-definition name pred infos port)
  989. (write (srfi-14-base-char-set-definition name pred infos)
  990. port)
  991. (newline port))
  992. (define (ranges->range-vector ranges)
  993. (let* ((range-count (length ranges))
  994. (range-vector (make-vector (* 2 (length ranges)))))
  995. (let loop ((i 0) (ranges ranges))
  996. (if (< i range-count)
  997. (begin
  998. (vector-set! range-vector (* 2 i) (caar ranges))
  999. (vector-set! range-vector (+ 1 (* 2 i)) (cdar ranges))
  1000. (loop (+ 1 i) (cdr ranges)))))
  1001. range-vector))
  1002. (define (srfi-14-base-char-set-definition name pred infos)
  1003. (let ((accumulator (make-ranges-accumulator pred)))
  1004. (for-each-expanded-code-point-info accumulator infos)
  1005. `(define ,name
  1006. (range-vector->char-set
  1007. ',(ranges->range-vector (accumulator 'ranges))))))
  1008. (define (make-ranges-accumulator pred)
  1009. (let ((rev-ranges '())
  1010. (current-left #f)
  1011. (current-right #f))
  1012. ;; assumes the characters arrive with ascending scalar values
  1013. (lambda (message)
  1014. (cond
  1015. ((not (code-point-info? message))
  1016. (if current-left
  1017. (reverse (cons (cons current-left current-right)
  1018. rev-ranges))
  1019. (reverse rev-ranges)))
  1020. ((pred message)
  1021. (let ((scalar-value (code-point-info-code-point message)))
  1022. (cond
  1023. ((not current-left)
  1024. (set! current-left scalar-value)
  1025. (set! current-right (+ 1 scalar-value)))
  1026. ((= scalar-value current-right)
  1027. (set! current-right (+ 1 current-right)))
  1028. (else
  1029. (set! rev-ranges
  1030. (cons (cons current-left current-right)
  1031. rev-ranges))
  1032. (set! current-left scalar-value)
  1033. (set! current-right (+ 1 scalar-value))))))))))
  1034. (define (write-normalization-tables infos excluded port)
  1035. (call-with-values
  1036. (lambda ()
  1037. (make-normalization-encoding-tables infos))
  1038. (lambda (indices encodings)
  1039. (write `(define *normalization-info-block-bits* ,*block-bits*)
  1040. port)
  1041. (newline port)
  1042. (write `(define *normalization-info-indices* ',indices)
  1043. port)
  1044. (newline port)
  1045. (write `(define *normalization-info-encodings* ',encodings)
  1046. port)
  1047. (newline port)))
  1048. (newline port)
  1049. (let ((canonical-pairs (canonical-decomposition-pairs infos)))
  1050. (write `(define *canonical-decomposition-scalar-values*
  1051. ',(list->vector (map car canonical-pairs)))
  1052. port)
  1053. (newline port)
  1054. (write `(define *canonical-decompositions*
  1055. ',(list->vector (map cdr canonical-pairs)))
  1056. port)
  1057. (newline port))
  1058. (newline port)
  1059. (call-with-values
  1060. (lambda ()
  1061. (compatibility-decomposition-tables infos))
  1062. (lambda (decompositions scalar-values indices)
  1063. (write `(define *compatibility-decompositions* ',decompositions)
  1064. port)
  1065. (newline port)
  1066. (write `(define *compatibility-scalar-values* ',scalar-values)
  1067. port)
  1068. (newline port)
  1069. (write `(define *compatibility-indices* ',indices)
  1070. port)
  1071. (newline port)))
  1072. (newline port)
  1073. (let ((composition-pairs (composition-pairs infos excluded)))
  1074. (write `(define *composition-scalar-values*
  1075. ',(list->vector (map car composition-pairs)))
  1076. port)
  1077. (newline port)
  1078. (write `(define *composition-encodings*
  1079. ',(list->vector (map cdr composition-pairs)))
  1080. port)
  1081. (newline port)))
  1082. (define (parse-composition-exclusions filename)
  1083. (call-with-input-file filename
  1084. (lambda (port)
  1085. (let loop ((exclusions '()))
  1086. (let ((thing (read-line port)))
  1087. (cond
  1088. ((eof-object? thing) exclusions)
  1089. ((and (not (string=? "" thing))
  1090. (not (char=? #\# (string-ref thing 0))))
  1091. (let ((end (or (string-skip thing char-set:hex-digit)
  1092. (string-length thing))))
  1093. (loop
  1094. (cons (string->number (substring thing 0 end) 16)
  1095. exclusions))))
  1096. (else (loop exclusions))))))))
  1097. (define (make-normalization-encoding-tables infos)
  1098. (compute-compact-table
  1099. (make-consecutive-info-source
  1100. (expanded-code-point-info-source infos)
  1101. (lambda (code-point)
  1102. (make-code-point-info code-point
  1103. "<unassigned>"
  1104. (general-category unassigned)
  1105. 0 #f #f #f #f #f #f #f #f #f
  1106. code-point code-point code-point))
  1107. (lambda (info)
  1108. (bitwise-ior (code-point-info-combining-class info) ; 0..240
  1109. (if (code-point-info-canonical-decomposition info)
  1110. #x100
  1111. 0)
  1112. (if (code-point-info-compatibility-decomposition info)
  1113. #x200
  1114. 0))))
  1115. (expt 2 *block-bits*)))
  1116. (define (encode-canonical-decomposition l)
  1117. (cond
  1118. ((null? (cdr l))
  1119. (if (> (car l) #xffff)
  1120. l
  1121. (car l)))
  1122. (else
  1123. (let ((a (car l))
  1124. (b (cadr l)))
  1125. (if (or (> a #xffff)
  1126. (> b #xffff))
  1127. (cons a b)
  1128. (bitwise-ior (arithmetic-shift b 16) a))))))
  1129. ;; generate an alist that maps scalar values to decomposition encodings
  1130. (define (canonical-decomposition-pairs infos)
  1131. (let ((pairs '()))
  1132. (for-each-expanded-code-point-info
  1133. (lambda (info)
  1134. (cond
  1135. ((code-point-info-canonical-decomposition info)
  1136. => (lambda (d)
  1137. (set! pairs
  1138. (cons
  1139. (cons (code-point-info-code-point info)
  1140. (encode-canonical-decomposition d))
  1141. pairs))))))
  1142. infos)
  1143. (reverse pairs)))
  1144. (define (compatibility-decomposition-tables infos)
  1145. (let ((reverse-decomps '())
  1146. (decomp-index 0)
  1147. (rev-infos '()))
  1148. (for-each-expanded-code-point-info
  1149. (lambda (info)
  1150. (cond
  1151. ((code-point-info-compatibility-decomposition info)
  1152. => (lambda (d)
  1153. (let ((size (length d)))
  1154. (set! reverse-decomps
  1155. (append (reverse d) reverse-decomps))
  1156. (set! rev-infos
  1157. (cons (cons (code-point-info-code-point info)
  1158. decomp-index)
  1159. rev-infos))
  1160. (set! decomp-index (+ decomp-index size)))))))
  1161. infos)
  1162. (let ((decomps (list->vector (reverse reverse-decomps))))
  1163. (values decomps
  1164. (list->vector (map car (reverse rev-infos)))
  1165. (list->vector
  1166. (map cdr (reverse (cons (cons #f (vector-length decomps)) rev-infos))))))))
  1167. (define (composition-pairs infos excluded)
  1168. (let ((pairs '()))
  1169. (for-each-expanded-code-point-info
  1170. (lambda (info)
  1171. (cond
  1172. ((code-point-info-canonical-decomposition info)
  1173. => (lambda (d)
  1174. (if (and (pair? (cdr d)) ; not a singleton
  1175. (not (member (code-point-info-code-point info) excluded))
  1176. (code-point-info-combining-class
  1177. (find-code-point-info (car d) infos))) ; possibly expensive
  1178. (set! pairs
  1179. (cons (cons (code-point-info-code-point info)
  1180. (encode-composition d))
  1181. pairs)))))))
  1182. infos)
  1183. (sort-list pairs
  1184. (lambda (p1 p2)
  1185. (< (cdr p1) (cdr p2))))))
  1186. (define (encode-composition l)
  1187. (if (or (> (car l) #xffff)
  1188. (> (cadr l) #xffff))
  1189. (error "non-BMP composition"))
  1190. (bitwise-ior (arithmetic-shift (cadr l) 16)
  1191. (car l)))
  1192. ; for debugging
  1193. (define (test-code-point-case+general-category-encoding-tables
  1194. infos special-uppercase-table special-lowercase-table
  1195. specialcasings
  1196. indices encodings
  1197. uppercase-offsets lowercase-offsets titlecase-offsets)
  1198. (let ((lower-mask (- (arithmetic-shift 1 *block-bits*) 1))
  1199. (uppercase-index-width (bits-necessary (vector-length uppercase-offsets)))
  1200. (lowercase-index-width (bits-necessary (vector-length lowercase-offsets)))
  1201. (titlecase-index-width (bits-necessary (vector-length titlecase-offsets))))
  1202. (for-each-expanded-code-point-info
  1203. (lambda (info)
  1204. (let* ((code-point (code-point-info-code-point info))
  1205. (base-index (vector-ref indices
  1206. (arithmetic-shift code-point (- *block-bits*))))
  1207. (index (+ base-index (bitwise-and code-point lower-mask)))
  1208. (encoding (vector-ref encodings index)))
  1209. (if (not (eq? (code-point-info-general-category info)
  1210. (general-category surrogate)))
  1211. (begin
  1212. (if (not (eq? (code-point-info-general-category info)
  1213. (code-point-encoding-general-category encoding)))
  1214. (error "general category mismatch"
  1215. info
  1216. (code-point-encoding-general-category encoding)))
  1217. (let ((uppercase-code-point
  1218. (code-point-encoding-uppercase-code-point
  1219. code-point encoding
  1220. uppercase-offsets
  1221. uppercase-index-width lowercase-index-width titlecase-index-width))
  1222. (lowercase-code-point
  1223. (code-point-encoding-lowercase-code-point
  1224. code-point encoding
  1225. lowercase-offsets
  1226. uppercase-index-width lowercase-index-width titlecase-index-width))
  1227. (titlecase-code-point
  1228. (code-point-encoding-titlecase-code-point
  1229. code-point encoding
  1230. titlecase-offsets
  1231. uppercase-index-width lowercase-index-width titlecase-index-width))
  1232. (uppercase?
  1233. (code-point-encoding-uppercase?
  1234. encoding
  1235. uppercase-index-width lowercase-index-width titlecase-index-width))
  1236. (lowercase?
  1237. (code-point-encoding-lowercase?
  1238. encoding
  1239. uppercase-index-width lowercase-index-width titlecase-index-width)))
  1240. (if (not (= (code-point-info-uppercase-code-point info)
  1241. uppercase-code-point))
  1242. (error "uppercase mismatch" info uppercase-code-point))
  1243. (if (not (= (code-point-info-lowercase-code-point info)
  1244. lowercase-code-point))
  1245. (error "lowercase mismatch" info lowercase-code-point))
  1246. (if (not (= (code-point-info-titlecase-code-point info)
  1247. titlecase-code-point))
  1248. (error "titlecase mismatch" info titlecase-code-point))
  1249. (if (not (eq? (or (table-ref special-uppercase-table code-point)
  1250. (eq? (code-point-info-general-category info)
  1251. (general-category uppercase-letter)))
  1252. uppercase?))
  1253. (error "uppercase? mismatch" info code-point))
  1254. (if (not (eq? (or (table-ref special-lowercase-table code-point)
  1255. (eq? (code-point-info-general-category info)
  1256. (general-category lowercase-letter)))
  1257. lowercase?))
  1258. (error "lowercase? mismatch" info code-point))
  1259. )))))
  1260. infos)))
  1261. (define (check-unicode-tables unicode-data-filename
  1262. proplist-filename
  1263. specialcasing-filename)
  1264. (let ((infos (parse-unicode-data unicode-data-filename))
  1265. (specialcasings (parse-specialcasing specialcasing-filename)))
  1266. (call-with-values
  1267. (lambda ()
  1268. (parse-proplist-for-upper/lowercase proplist-filename))
  1269. (lambda (special-uppercase-table special-lowercase-table)
  1270. (call-with-values
  1271. (lambda ()
  1272. (make-scalar-value-case+general-category-encoding-tables
  1273. infos
  1274. special-lowercase-table special-uppercase-table
  1275. specialcasings))
  1276. (lambda (indices
  1277. encodings
  1278. uppercase-offsets lowercase-offsets titlecase-offsets)
  1279. (test-code-point-case+general-category-encoding-tables
  1280. infos special-uppercase-table special-lowercase-table
  1281. specialcasings
  1282. indices encodings
  1283. uppercase-offsets lowercase-offsets titlecase-offsets)))))))
  1284. (define (find-code-point-info code-point infos)
  1285. (call-with-current-continuation
  1286. (lambda (return)
  1287. (for-each-expanded-code-point-info
  1288. (lambda (info)
  1289. (if (= code-point (code-point-info-code-point info))
  1290. (return info)))
  1291. infos))))