unicode-data.scm 45 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422
  1. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; Copyright (c) 2005-2006 by Basis Technology Corporation. See file COPYING.
  3. ; Parse UnicodeData.txt and various other files from the Unicode
  4. ; consortium, and generate character classification and conversion
  5. ; tables from it.
  6. (define (string-split string at)
  7. (let ((count (string-length string)))
  8. (let loop ((index 0)
  9. (rev-result '()))
  10. (cond
  11. ((>= index count)
  12. (reverse (cons "" rev-result)))
  13. ((string-index string at index)
  14. => (lambda (found)
  15. (loop (+ 1 found)
  16. (cons (substring string index found)
  17. rev-result))))
  18. (else
  19. (reverse (cons (substring string index count)
  20. rev-result)))))))
  21. (define (split-unicode-data-record line)
  22. (string-split line #\;))
  23. (define (maybe-code-point text default)
  24. (if (zero? (string-length text))
  25. default
  26. (string->number text 16)))
  27. (define-record-type code-point-info :code-point-info
  28. (make-code-point-info code-point
  29. name
  30. general-category
  31. combining-class
  32. bidirectional-category-id
  33. canonical-decomposition
  34. compatibility-decomposition
  35. decimal-digit-value
  36. digit-value
  37. numeric-value
  38. mirrored?
  39. unicode-1.0-name
  40. iso-10646-comment
  41. uppercase-code-point
  42. lowercase-code-point
  43. titlecase-code-point)
  44. code-point-info?
  45. ;; number
  46. (code-point code-point-info-code-point)
  47. ;; string
  48. (name code-point-info-name)
  49. ;; :GENERAL-CATEGORY
  50. (general-category code-point-info-general-category)
  51. ;; number
  52. (combining-class code-point-info-combining-class)
  53. ;; symbol
  54. (bidirectional-category-id code-point-info-bidirectional-category-id)
  55. ;; #f or list
  56. (canonical-decomposition code-point-info-canonical-decomposition)
  57. (compatibility-decomposition code-point-info-compatibility-decomposition)
  58. ;; number
  59. (decimal-digit-value code-point-info-decimal-digit-value)
  60. ;; number
  61. (digit-value code-point-info-digit-value)
  62. ;; number
  63. (numeric-value code-point-info-numeric-value)
  64. ;; boolean
  65. (mirrored? code-point-info-mirrored?)
  66. ;; string
  67. (unicode-1.0-name code-point-info-unicode-1.0-name)
  68. ;; string
  69. (iso-10646-comment code-point-info-iso-10646-comment)
  70. ;; number
  71. (uppercase-code-point code-point-info-uppercase-code-point)
  72. ;; number
  73. (lowercase-code-point code-point-info-lowercase-code-point)
  74. ;; number
  75. (titlecase-code-point code-point-info-titlecase-code-point))
  76. (define-record-discloser :code-point-info
  77. (lambda (r)
  78. (list 'code-point-info
  79. (code-point-info-code-point r)
  80. (code-point-info-name r)
  81. (code-point-info-general-category r)
  82. (code-point-info-combining-class r)
  83. (code-point-info-bidirectional-category-id r)
  84. (code-point-info-canonical-decomposition r)
  85. (code-point-info-compatibility-decomposition r)
  86. (code-point-info-decimal-digit-value r)
  87. (code-point-info-digit-value r)
  88. (code-point-info-numeric-value r)
  89. (code-point-info-mirrored? r)
  90. (code-point-info-unicode-1.0-name r)
  91. (code-point-info-iso-10646-comment r)
  92. (code-point-info-uppercase-code-point r)
  93. (code-point-info-lowercase-code-point r)
  94. (code-point-info-titlecase-code-point r))))
  95. (define (unicode-data-record->info line)
  96. (destructure (((code-point-hex
  97. name
  98. general-category-id
  99. combining-class-id
  100. bidirectional-category-text
  101. decomposition-text
  102. decimal-digit-value-text
  103. digit-value-text
  104. numeric-value-text
  105. mirrored-y/n
  106. unicode-1.0-name
  107. iso-10646-comment
  108. uppercase-code-point-hex
  109. lowercase-code-point-hex
  110. titlecase-code-point-hex)
  111. (split-unicode-data-record line)))
  112. (let ((code-point (maybe-code-point code-point-hex #f)))
  113. (let ((uppercase-code-point (maybe-code-point uppercase-code-point-hex code-point))
  114. (lowercase-code-point (maybe-code-point lowercase-code-point-hex code-point))
  115. (titlecase-code-point (maybe-code-point titlecase-code-point-hex code-point))
  116. (decomposition (parse-decomposition decomposition-text)))
  117. (make-code-point-info code-point
  118. name
  119. (id->general-category general-category-id)
  120. (string->number combining-class-id)
  121. (string->symbol bidirectional-category-text)
  122. (and (and (pair? decomposition) (number? (car decomposition)))
  123. decomposition)
  124. (and (and (pair? decomposition) (symbol? (car decomposition)))
  125. (cdr decomposition))
  126. (string->number decimal-digit-value-text)
  127. (string->number digit-value-text)
  128. (string->number numeric-value-text)
  129. (string=? mirrored-y/n "Y")
  130. unicode-1.0-name
  131. iso-10646-comment
  132. uppercase-code-point
  133. lowercase-code-point
  134. titlecase-code-point)))))
  135. ;; return #f or a list, which contains the scalar values of the decompositon
  136. ;; for compatibility decompositions, the tag is prepended as a symbol
  137. (define (parse-decomposition d)
  138. (cond
  139. ((zero? (string-length d))
  140. #f)
  141. ((char=? #\< (string-ref d 0))
  142. (let ((after (string-index d #\space)))
  143. (cons (string->symbol (substring d 0 after))
  144. (call-with-values
  145. (lambda ()
  146. (parse-scalar-values d after))
  147. (lambda (l i) l)))))
  148. (else
  149. (call-with-values
  150. (lambda ()
  151. (parse-scalar-values d 0))
  152. (lambda (l i) l)))))
  153. ; for EXPANDED-CODE-POINT-INFO-SOURCE
  154. (define (code-point-info-with-code-point+name info code-point name)
  155. (make-code-point-info code-point
  156. name
  157. (code-point-info-general-category info)
  158. (code-point-info-combining-class info)
  159. (code-point-info-bidirectional-category-id info)
  160. (code-point-info-canonical-decomposition info)
  161. (code-point-info-compatibility-decomposition info)
  162. (code-point-info-decimal-digit-value info)
  163. (code-point-info-digit-value info)
  164. (code-point-info-numeric-value info)
  165. (code-point-info-mirrored? info)
  166. (code-point-info-unicode-1.0-name info)
  167. (code-point-info-iso-10646-comment info)
  168. code-point code-point code-point)) ; kludge
  169. ; expand the code-point ranges that look like this:
  170. ; 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  171. ; 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  172. ; returns a thunk that returns the infos from consecutive calls,
  173. ; then #f
  174. (define (expanded-code-point-info-source infos)
  175. (let ((first-info #f)
  176. (code-point #f)
  177. (last-code-point #f)
  178. (name-base #f))
  179. (lambda ()
  180. (let again ()
  181. (cond
  182. (first-info
  183. (if (<= code-point last-code-point)
  184. (begin
  185. (set! code-point (+ 1 code-point))
  186. (code-point-info-with-code-point+name
  187. first-info
  188. (- code-point 1)
  189. name-base)) ; kludge for speed; should be:
  190. ; (string-append name-base (number->string code-point 16))
  191. (begin
  192. (set! first-info #f)
  193. (again))))
  194. ((null? infos)
  195. #f)
  196. (else
  197. (let* ((info (car infos))
  198. (name (code-point-info-name info)))
  199. (cond
  200. ((and (string-prefix? "<" name)
  201. (string-suffix? ", First>" name))
  202. (set! first-info info)
  203. (set! code-point (code-point-info-code-point info))
  204. (set! last-code-point (code-point-info-code-point (cadr infos)))
  205. (set! name-base (string-append
  206. (substring name
  207. 1 ; (string-length "<")
  208. (- (string-length name)
  209. 8 ; (string-length ", First>")
  210. ))
  211. "-<code point>")) ; kludge, see above
  212. (set! infos (cddr infos))
  213. (again))
  214. (else
  215. (set! infos (cdr infos))
  216. info)))))))))
  217. (define (for-each-expanded-code-point-info proc infos)
  218. (let ((source (expanded-code-point-info-source infos)))
  219. (let loop ()
  220. (let ((info (source)))
  221. (if info
  222. (begin
  223. (proc info)
  224. (loop)))))))
  225. (define (read-line port)
  226. (let loop ((l '()))
  227. (let ((c (read-char port)))
  228. (if (eof-object? c)
  229. c
  230. (if (char=? c #\newline)
  231. (list->string (reverse l))
  232. (loop (cons c l)))))))
  233. (define (parse-unicode-data filename)
  234. (call-with-input-file filename
  235. (lambda (port)
  236. (let loop ((rev-infos '()))
  237. (let ((thing (read-line port)))
  238. (if (eof-object? thing)
  239. (reverse rev-infos)
  240. (loop (cons (unicode-data-record->info thing) rev-infos))))))))
  241. ; Mapping the relevant info (general category + case mappings) into a
  242. ; compact array
  243. (define (mapping-offsets infos accessor)
  244. (let loop ((infos infos)
  245. (offsets '()))
  246. (if (null? infos)
  247. (list->vector offsets)
  248. (let* ((info (car infos))
  249. (code-point (code-point-info-code-point info))
  250. (other (accessor info))
  251. (offset (- other code-point)))
  252. (if (member offset offsets)
  253. (loop (cdr infos) offsets)
  254. (loop (cdr infos) (cons offset offsets)))))))
  255. (define (vector-index vector value)
  256. (let ((count (vector-length vector)))
  257. (let loop ((i 0))
  258. (cond
  259. ((>= i count) #f)
  260. ((equal? value (vector-ref vector i)) i)
  261. (else (loop (+ 1 i)))))))
  262. (define (code-point-info->case+general-category-encoding
  263. info
  264. specialcasing?
  265. special-lowercase-table special-uppercase-table
  266. uppercase-offsets lowercase-offsets titlecase-offsets
  267. uppercase-index-width lowercase-index-width titlecase-index-width)
  268. (let ((code-point (code-point-info-code-point info)))
  269. (let ((uppercase-index (vector-index uppercase-offsets
  270. (- (code-point-info-uppercase-code-point info)
  271. code-point)))
  272. (lowercase-index (vector-index lowercase-offsets
  273. (- (code-point-info-lowercase-code-point info)
  274. code-point)))
  275. (titlecase-index (vector-index titlecase-offsets
  276. (- (code-point-info-titlecase-code-point info)
  277. code-point)))
  278. (uppercase? (or (eq? (general-category uppercase-letter)
  279. (code-point-info-general-category info))
  280. (table-ref special-uppercase-table code-point)))
  281. (lowercase? (or (eq? (general-category lowercase-letter)
  282. (code-point-info-general-category info))
  283. (table-ref special-lowercase-table code-point))))
  284. (bitwise-ior
  285. (arithmetic-shift
  286. (bitwise-ior
  287. (arithmetic-shift (bitwise-ior
  288. (arithmetic-shift
  289. (bitwise-ior
  290. (arithmetic-shift
  291. (bitwise-ior (if specialcasing? 4 0)
  292. (if uppercase? 2 0)
  293. (if lowercase? 1 0))
  294. uppercase-index-width)
  295. uppercase-index)
  296. lowercase-index-width)
  297. lowercase-index)
  298. titlecase-index-width)
  299. titlecase-index)
  300. *general-category-bits*)
  301. (general-category-index (code-point-info-general-category info))))))
  302. (define (code-point-encoding-uppercase? encoding
  303. uppercase-index-width lowercase-index-width titlecase-index-width)
  304. (not
  305. (zero?
  306. (bitwise-and 1
  307. (arithmetic-shift encoding
  308. (- (+ 1
  309. uppercase-index-width
  310. lowercase-index-width
  311. titlecase-index-width
  312. *general-category-bits*)))))))
  313. (define (code-point-encoding-lowercase? encoding
  314. uppercase-index-width lowercase-index-width titlecase-index-width)
  315. (not
  316. (zero?
  317. (bitwise-and 1
  318. (arithmetic-shift encoding
  319. (- (+ uppercase-index-width
  320. lowercase-index-width
  321. titlecase-index-width
  322. *general-category-bits*)))))))
  323. (define (lookup-by-offset-index code-point offset-index offsets)
  324. (+ code-point (vector-ref offsets offset-index)))
  325. (define (code-point-encoding-uppercase-code-point code-point encoding
  326. uppercase-offsets
  327. uppercase-index-width lowercase-index-width titlecase-index-width)
  328. (lookup-by-offset-index
  329. code-point
  330. (bitwise-and (- (arithmetic-shift 1 uppercase-index-width) 1)
  331. (arithmetic-shift encoding
  332. (- (+ lowercase-index-width titlecase-index-width *general-category-bits*))))
  333. uppercase-offsets))
  334. (define (code-point-encoding-lowercase-code-point code-point encoding
  335. lowercase-offsets
  336. uppercase-index-width lowercase-index-width titlecase-index-width)
  337. (lookup-by-offset-index
  338. code-point
  339. (bitwise-and (- (arithmetic-shift 1 lowercase-index-width) 1)
  340. (arithmetic-shift encoding
  341. (- (+ titlecase-index-width *general-category-bits*))))
  342. lowercase-offsets))
  343. (define (code-point-encoding-titlecase-code-point code-point encoding
  344. titlecase-offsets
  345. uppercase-index-width lowercase-index-width titlecase-index-width)
  346. (lookup-by-offset-index
  347. code-point
  348. (bitwise-and (- (arithmetic-shift 1 titlecase-index-width) 1)
  349. (arithmetic-shift encoding (- *general-category-bits*)))
  350. titlecase-offsets))
  351. (define *code-point-encoding-general-category-mask*
  352. (- (arithmetic-shift 1 *general-category-bits*) 1))
  353. (define (code-point-encoding-general-category encoding)
  354. (vector-ref general-categories
  355. (bitwise-and encoding *code-point-encoding-general-category-mask*)))
  356. (define (max-code-point infos)
  357. (let loop ((max 0) (infos infos))
  358. (cond
  359. ((null? infos) max)
  360. ((> (code-point-info-code-point (car infos))
  361. max)
  362. (loop (code-point-info-code-point (car infos)) (cdr infos)))
  363. (else (loop max (cdr infos))))))
  364. ; returns a THUNK that will return for each code-point in sequence
  365. ; (PROC <code-point>) or DEFAULT if there's no info.
  366. ; assumes INFOS are sorted
  367. (define (make-consecutive-info-source source make-default proc)
  368. (let ((next-info #f)
  369. (last-code-point -1))
  370. (lambda ()
  371. (define (upto info)
  372. (if (< last-code-point (code-point-info-code-point info))
  373. (begin
  374. (set! next-info info)
  375. (proc (make-default last-code-point)))
  376. (begin
  377. (set! next-info #f)
  378. ;; scalar values only
  379. (if (eq? (code-point-info-general-category info)
  380. (general-category surrogate))
  381. (proc (make-default last-code-point))
  382. (proc info)))))
  383. (set! last-code-point (+ 1 last-code-point))
  384. (cond
  385. ((or next-info (source)) => upto)
  386. (else #f)))))
  387. ; Dealing with PropList.txt
  388. (define (parse-proplist-for-upper/lowercase filename)
  389. (call-with-input-file filename
  390. (lambda (port)
  391. (let ((uppercase (make-integer-table)) (lowercase (make-integer-table)))
  392. (let loop ()
  393. (let ((thing (read-line port)))
  394. (if (eof-object? thing)
  395. (values uppercase lowercase)
  396. (call-with-values
  397. (lambda ()
  398. (extract-upper/lowercase thing))
  399. (lambda (uppers lowers)
  400. (for-each (lambda (u)
  401. (table-set! uppercase u #t))
  402. uppers)
  403. (for-each (lambda (l)
  404. (table-set! lowercase l #t))
  405. lowers)
  406. (loop))))))))))
  407. (define (extract-upper/lowercase line)
  408. (cond
  409. ((string-prefix? "#" line)
  410. (values '() '()))
  411. ((string-contains line "Other_Uppercase")
  412. (values (proplist-line-range line)
  413. '()))
  414. ((string-contains line "Other_Lowercase")
  415. (values '()
  416. (proplist-line-range line)))
  417. (else
  418. (values '() '()))))
  419. (define (proplist-line-range line)
  420. (let* ((i1 (string-skip line char-set:hex-digit))
  421. (first (string->number (substring line 0 i1) 16)))
  422. (if (char=? #\. (string-ref line i1))
  423. (let* ((i2 (string-skip line #\. i1))
  424. (i3 (string-skip line char-set:hex-digit i2))
  425. (last (string->number (substring line i2 i3) 16)))
  426. (let loop ((last last) (range '()))
  427. (if (= last first)
  428. (cons last range)
  429. (loop (- last 1) (cons last range)))))
  430. (list first))))
  431. ; assumes START points to whitespace or the first digit
  432. ; returns list of scalar values + position after sequence
  433. ; (possibly after trailing semicolon)
  434. (define (parse-scalar-values s start)
  435. (let ((size (string-length s)))
  436. (let loop ((start start) (rev-values '()))
  437. (let ((i1 (string-skip s char-set:whitespace start)))
  438. (cond
  439. ((not i1)
  440. (values (reverse rev-values) (+ start 1)))
  441. ((char=? #\; (string-ref s i1))
  442. (values (reverse rev-values) (+ i1 1)))
  443. (else
  444. (let* ((i2 (or (string-skip s char-set:hex-digit i1)
  445. size))
  446. (n (string->number (substring s i1 i2) 16)))
  447. (loop i2 (cons n rev-values)))))))))
  448. (define-record-type specialcasing :specialcasing
  449. (make-specialcasing scalar-value
  450. lowercase titlecase uppercase foldcase
  451. final-sigma?)
  452. specialcasing?
  453. (scalar-value specialcasing-scalar-value)
  454. (lowercase specialcasing-lowercase)
  455. (titlecase specialcasing-titlecase)
  456. (uppercase specialcasing-uppercase)
  457. ;; This will actually come from CaseFolding.txt
  458. (foldcase specialcasing-foldcase set-specialcasing-foldcase!)
  459. (final-sigma? specialcasing-final-sigma?))
  460. (define (parse-specialcasing-line line)
  461. (let* ((i1 (string-skip line char-set:hex-digit 0))
  462. (n (string->number (substring line 0 i1) 16)))
  463. (call-with-values
  464. (lambda () (parse-scalar-values line (+ 1 i1)))
  465. (lambda (lowercase i2)
  466. (call-with-values
  467. (lambda () (parse-scalar-values line i2))
  468. (lambda (titlecase i3)
  469. (call-with-values
  470. (lambda () (parse-scalar-values line i3))
  471. (lambda (uppercase i4)
  472. (let ((i5 (or (string-index line #\; (+ 1 i4))
  473. (string-index line #\# (+ 1 i4))
  474. (string-length line))))
  475. (let ((conditions (string-trim-both (substring line i4 i5))))
  476. (if (or (string=? "" conditions)
  477. (string=? "Final_Sigma" conditions))
  478. (make-specialcasing n
  479. lowercase titlecase uppercase #f
  480. (string=? conditions "Final_Sigma"))
  481. #f)))))))))))
  482. (define (parse-specialcasing filename)
  483. (call-with-input-file filename
  484. (lambda (port)
  485. (let loop ((specialcasings '()))
  486. (let ((thing (read-line port)))
  487. (if (eof-object? thing)
  488. specialcasings
  489. (cond
  490. ((and (not (string=? "" thing))
  491. (not (char=? #\# (string-ref thing 0)))
  492. (parse-specialcasing-line thing))
  493. => (lambda (sc)
  494. (loop (cons sc specialcasings))))
  495. (else (loop specialcasings)))))))))
  496. ; we only extract the common and full case foldings
  497. (define (parse-casefolding-line line)
  498. (let* ((i1 (string-skip line char-set:hex-digit 0))
  499. (n (string->number (substring line 0 i1) 16))
  500. (i2 (string-skip line char-set:whitespace (+ 1 i1)))
  501. (status (string-ref line i2)))
  502. (call-with-values
  503. (lambda ()
  504. (parse-scalar-values line (+ 2 i2)))
  505. (lambda (scalar-values i)
  506. (cond
  507. ((or (char=? status #\C)
  508. (char=? status #\F))
  509. (cons n (cons status scalar-values)))
  510. ((> (length scalar-values) 1)
  511. (error "multi-character common case-folding mapping"))
  512. (else #f))))))
  513. (define (parse-casefolding filename)
  514. (call-with-input-file filename
  515. (lambda (port)
  516. (let loop ((casefoldings '()))
  517. (let ((thing (read-line port)))
  518. (cond
  519. ((eof-object? thing) casefoldings)
  520. ((and (not (string=? "" thing))
  521. (not (char=? #\# (string-ref thing 0)))
  522. (parse-casefolding-line thing))
  523. => (lambda (folding)
  524. (loop (cons folding casefoldings))))
  525. (else (loop casefoldings))))))))
  526. (define (merge-specialcasings+casefoldings! specialcasings casefoldings)
  527. (for-each
  528. (lambda (casefolding)
  529. (let ((sv (car casefolding))
  530. (status (cadr casefolding))
  531. (folding (cddr casefolding)))
  532. (cond
  533. ((find (lambda (specialcasing)
  534. (= (specialcasing-scalar-value specialcasing) sv))
  535. specialcasings)
  536. => (lambda (specialcasing)
  537. (set-specialcasing-foldcase! specialcasing folding)))
  538. ((char=? status #\F) ; the others will be covered by UnicodeData.txt
  539. (let ((sv-list (list sv)))
  540. (set! specialcasings
  541. (cons
  542. (make-specialcasing sv
  543. sv-list sv-list sv-list
  544. folding
  545. #f)
  546. specialcasings)))))))
  547. casefoldings)
  548. specialcasings)
  549. (define (parse-specialcasing+casefolding specialcasing-filename casefolding-filename)
  550. (let ((specialcasings (parse-specialcasing specialcasing-filename))
  551. (casefoldings (parse-casefolding casefolding-filename)))
  552. (merge-specialcasings+casefoldings! specialcasings casefoldings)))
  553. (define (list-prefix? l1 l2)
  554. (let loop ((l1 l1) (l2 l2))
  555. (cond
  556. ((null? l1) #t)
  557. ((null? l2) #f)
  558. ((equal? (car l1) (car l2))
  559. (loop (cdr l1) (cdr l2)))
  560. (else #f))))
  561. ; We return two lists: a list of :SPECIALCASING records where the
  562. ; xxxCASE fields are replaced by (offset . length) pairs into the
  563. ; second list, which contains all the case mappings jumbled together.
  564. (define (specialcasing-encoding specialcasings)
  565. (let ((casings '()))
  566. (define (add-casing! l)
  567. (let loop ((rest casings)
  568. (index 0))
  569. (cond
  570. ((null? rest)
  571. (set! casings (append casings l))
  572. index)
  573. ((list-prefix? l rest)
  574. index)
  575. (else
  576. (loop (cdr rest) (+ 1 index))))))
  577. (define (transform-specialcasing s)
  578. (let ((lowercase (cons (add-casing! (specialcasing-lowercase s))
  579. (length (specialcasing-lowercase s))))
  580. (titlecase (cons (add-casing! (specialcasing-titlecase s))
  581. (length (specialcasing-titlecase s))))
  582. (uppercase (cons (add-casing! (specialcasing-uppercase s))
  583. (length (specialcasing-uppercase s))))
  584. (foldcase (cons (add-casing! (specialcasing-foldcase s))
  585. (length (specialcasing-foldcase s)))))
  586. (make-specialcasing (specialcasing-scalar-value s)
  587. lowercase titlecase uppercase foldcase
  588. (specialcasing-final-sigma? s))))
  589. (let ((transformed
  590. (map transform-specialcasing specialcasings)))
  591. (values transformed
  592. casings))))
  593. (define (specialcasing-encoding-ref casings offset size)
  594. (let loop ((i 0) (r '()))
  595. (if (>= i size)
  596. (reverse r)
  597. (loop (+ 1 i)
  598. (cons (vector-ref casings (+ offset i))
  599. r)))))
  600. ; for testing
  601. (define (check-specialcasing-encodings specialcasings)
  602. (call-with-values
  603. (lambda () (specialcasing-encoding specialcasings))
  604. (lambda (encodings casings)
  605. (let ((casings (list->vector casings)))
  606. (for-each
  607. (lambda (specialcasing encoding)
  608. (define (check select)
  609. (let ((pair (select encoding))
  610. (reference (select specialcasing)))
  611. (if (not
  612. (equal? reference
  613. (specialcasing-encoding-ref casings
  614. (car pair) (cdr pair))))
  615. (error "encoding failure" encoding
  616. reference (specialcasing-encoding-ref casings
  617. (car pair) (cdr pair))))))
  618. (check specialcasing-lowercase)
  619. (check specialcasing-uppercase)
  620. (check specialcasing-titlecase)
  621. (check specialcasing-foldcase))
  622. specialcasings encodings)))))
  623. (define (specialcasings->table specialcasings)
  624. (let ((table (make-integer-table)))
  625. (for-each (lambda (s)
  626. (table-set! table (specialcasing-scalar-value s)
  627. s))
  628. specialcasings)
  629. table))
  630. (define (make-scalar-value-case+general-category-encoding-tables
  631. infos
  632. special-lowercase-table special-uppercase-table
  633. specialcasings)
  634. (let ((uppercase-offsets (mapping-offsets infos code-point-info-uppercase-code-point))
  635. (lowercase-offsets (mapping-offsets infos code-point-info-lowercase-code-point))
  636. (titlecase-offsets (mapping-offsets infos code-point-info-titlecase-code-point)))
  637. (let ((uppercase-index-width (bits-necessary (vector-length uppercase-offsets)))
  638. (lowercase-index-width (bits-necessary (vector-length lowercase-offsets)))
  639. (titlecase-index-width (bits-necessary (vector-length titlecase-offsets)))
  640. (specialcasings-table (specialcasings->table specialcasings))
  641. (block-size (expt 2 *block-bits*)))
  642. (call-with-values
  643. (lambda ()
  644. (compute-compact-table
  645. (make-consecutive-info-source
  646. (expanded-code-point-info-source infos)
  647. (lambda (code-point)
  648. (make-code-point-info code-point
  649. "<unassigned>"
  650. (general-category unassigned)
  651. #f #f #f #f #f #f #f #f #f #f
  652. code-point code-point code-point))
  653. (lambda (info)
  654. (code-point-info->case+general-category-encoding
  655. info
  656. (table-ref specialcasings-table
  657. (code-point-info-code-point info))
  658. special-lowercase-table special-uppercase-table
  659. uppercase-offsets lowercase-offsets titlecase-offsets
  660. uppercase-index-width lowercase-index-width titlecase-index-width)))
  661. block-size))
  662. (lambda (indices encodings)
  663. (values indices encodings
  664. uppercase-offsets lowercase-offsets titlecase-offsets))))))
  665. ; saves a couple of kilobyes, but probably not worthwhile
  666. (define (write-vector-code/rll name vector port)
  667. (write `(define ,name (make-vector ,(vector-length vector)))
  668. port)
  669. (newline port)
  670. (let loop ((values (vector->list vector))
  671. (index 0))
  672. (cond
  673. ((null? values))
  674. ((or (null? (cdr values))
  675. (not (equal? (car values) (cadr values))))
  676. (write `(vector-set! ,name ,index ,(car values))
  677. port)
  678. (newline port)
  679. (loop (cdr values) (+ 1 index)))
  680. (else
  681. (let ((value (car values)))
  682. (let inner-loop ((values values)
  683. (last-index index))
  684. (cond
  685. ((or (null? values)
  686. (not (equal? (car values) value)))
  687. (write
  688. `(do ((i ,index (+ 1 i)))
  689. ((>= i ,last-index))
  690. (vector-set! ,name i ,value))
  691. port)
  692. (newline port)
  693. (loop values last-index))
  694. (else
  695. (inner-loop (cdr values) (+ 1 last-index))))))))))
  696. (define (create-unicode-tables unicode-data-filename
  697. proplist-filename
  698. specialcasing-filename
  699. casefolding-filename
  700. composition-exclusions-filename
  701. category-output-file
  702. syntax-info-output-file
  703. normalization-output-file
  704. srfi-14-base-output-file)
  705. (let ((infos (parse-unicode-data unicode-data-filename))
  706. (specialcasings (parse-specialcasing+casefolding specialcasing-filename
  707. casefolding-filename)))
  708. (call-with-values
  709. (lambda ()
  710. (parse-proplist-for-upper/lowercase proplist-filename))
  711. (lambda (special-uppercase-table special-lowercase-table)
  712. (call-with-output-file category-output-file
  713. (lambda (port)
  714. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  715. port)
  716. (newline port)
  717. (newline port)
  718. (write-unicode-category-tables infos
  719. special-uppercase-table special-lowercase-table
  720. specialcasings
  721. port)
  722. (write-specialcasings-tables specialcasings port)))
  723. (call-with-output-file syntax-info-output-file
  724. (lambda (port)
  725. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  726. port)
  727. (newline port)
  728. (newline port)
  729. (write-syntax-info infos port)
  730. (newline port)))
  731. (write-srfi-14-base-char-sets infos srfi-14-base-output-file)
  732. (call-with-output-file normalization-output-file
  733. (lambda (port)
  734. (display "; Automatically generated by WRITE-UNICODE-CATEGORY-TABLES; do not edit."
  735. port)
  736. (newline port)
  737. (newline port)
  738. (write-normalization-tables
  739. infos
  740. (parse-composition-exclusions composition-exclusions-filename)
  741. port)))))))
  742. (define *block-bits* 8) ; better than 9, at least
  743. (define (write-unicode-category-tables infos
  744. special-uppercase-table special-lowercase-table
  745. specialcasings
  746. port)
  747. (call-with-values
  748. (lambda ()
  749. (make-scalar-value-case+general-category-encoding-tables
  750. infos
  751. special-lowercase-table special-uppercase-table
  752. specialcasings))
  753. (lambda (indices
  754. encodings
  755. uppercase-offsets lowercase-offsets titlecase-offsets)
  756. (write `(define *encoding-table-block-bits* ,*block-bits*)
  757. port)
  758. (newline port)
  759. (newline port)
  760. (write `(define *uppercase-index-width*
  761. ,(bits-necessary (vector-length uppercase-offsets)))
  762. port)
  763. (newline port)
  764. (write `(define *lowercase-index-width*
  765. ,(bits-necessary (vector-length lowercase-offsets)))
  766. port)
  767. (newline port)
  768. (write `(define *titlecase-index-width*
  769. ,(bits-necessary (vector-length titlecase-offsets)))
  770. port)
  771. (newline port)
  772. (newline port)
  773. (write `(define *scalar-value-info-indices* ',indices)
  774. port)
  775. (newline port)
  776. (write `(define *scalar-value-info-encodings* ',encodings)
  777. port)
  778. (newline port)
  779. (newline port)
  780. (write `(define *uppercase-offsets* ',uppercase-offsets)
  781. port)
  782. (newline port)
  783. (write `(define *lowercase-offsets* ',lowercase-offsets)
  784. port)
  785. (newline port)
  786. (write `(define *titlecase-offsets* ',titlecase-offsets)
  787. port)
  788. (newline port)
  789. (newline port))))
  790. (define (write-specialcasings-tables specialcasings port)
  791. (call-with-values
  792. (lambda () (specialcasing-encoding specialcasings))
  793. (lambda (encodings casings)
  794. ;; we write it out here to avoid introducing yet another file
  795. ;; into the UNICODE-CHAR-MAPS package
  796. (write
  797. '(define-record-type specialcasing :specialcasing
  798. (make-specialcasing scalar-value
  799. lowercase-start lowercase-length
  800. titlecase-start titlecase-length
  801. uppercase-start uppercase-length
  802. foldcase-start foldcase-length
  803. final-sigma?)
  804. specialcasing?
  805. (scalar-value specialcasing-scalar-value)
  806. (lowercase-start specialcasing-lowercase-start)
  807. (lowercase-length specialcasing-lowercase-length)
  808. (titlecase-start specialcasing-titlecase-start)
  809. (titlecase-length specialcasing-titlecase-length)
  810. (uppercase-start specialcasing-uppercase-start)
  811. (uppercase-length specialcasing-uppercase-length)
  812. (foldcase-start specialcasing-foldcase-start)
  813. (foldcase-length specialcasing-foldcase-length)
  814. (final-sigma? specialcasing-final-sigma?))
  815. port)
  816. (newline port)
  817. (newline port)
  818. (write `(define *specialcasing-table* (make-integer-table)) port)
  819. (newline port)
  820. (newline port)
  821. (for-each
  822. (lambda (c)
  823. (write
  824. `(table-set! *specialcasing-table*
  825. ,(specialcasing-scalar-value c)
  826. (make-specialcasing
  827. ,(specialcasing-scalar-value c)
  828. ,(car (specialcasing-lowercase c))
  829. ,(cdr (specialcasing-lowercase c))
  830. ,(car (specialcasing-titlecase c))
  831. ,(cdr (specialcasing-titlecase c))
  832. ,(car (specialcasing-uppercase c))
  833. ,(cdr (specialcasing-uppercase c))
  834. ,(car (specialcasing-foldcase c))
  835. ,(cdr (specialcasing-foldcase c))
  836. ,(specialcasing-final-sigma? c)))
  837. port)
  838. (newline port))
  839. encodings)
  840. (newline port)
  841. (write `(define *specialcasings* (list->string (map scalar-value->char ',casings))) port)
  842. (newline port)
  843. (newline port))))
  844. ;; Read syntax
  845. (define (write-syntax-info infos port)
  846. (write `(define *non-symbol-constituents-above-127*
  847. ',(list->vector (non-symbol-constituents-above-127 infos)))
  848. port)
  849. (newline port)
  850. (newline port)
  851. (write `(define *whitespaces*
  852. ',(list->vector (whitespaces infos)))
  853. port)
  854. (newline port))
  855. (define *symbol-constituent-general-categories*
  856. (list (general-category uppercase-letter)
  857. (general-category lowercase-letter)
  858. (general-category titlecase-letter)
  859. (general-category modified-letter)
  860. (general-category other-letter)
  861. (general-category non-spacing-mark)
  862. (general-category combining-spacing-mark)
  863. (general-category enclosing-mark)
  864. (general-category decimal-digit-number)
  865. (general-category letter-number)
  866. (general-category other-number)
  867. (general-category dash-punctuation)
  868. (general-category connector-punctuation)
  869. (general-category other-punctuation)
  870. (general-category currency-symbol)
  871. (general-category mathematical-symbol)
  872. (general-category modifier-symbol)
  873. (general-category other-symbol)
  874. (general-category private-use-character)))
  875. (define (symbol-constituent-above-127? info)
  876. (memq (code-point-info-general-category info)
  877. *symbol-constituent-general-categories*))
  878. (define (non-symbol-constituents-above-127 infos)
  879. (let ((reverse-non-constituents '()))
  880. (for-each-expanded-code-point-info
  881. (lambda (info)
  882. (let ((cp (code-point-info-code-point info)))
  883. (if (and (> cp 127)
  884. (not (eq? (general-category surrogate)
  885. (code-point-info-general-category info)))
  886. (not (symbol-constituent-above-127? info)))
  887. (set! reverse-non-constituents
  888. (cons cp reverse-non-constituents)))))
  889. infos)
  890. (reverse reverse-non-constituents)))
  891. (define (whitespaces infos)
  892. (let ((reverse-whitespaces '()))
  893. (for-each-expanded-code-point-info
  894. (lambda (info)
  895. (if (eq? (general-category-primary-category
  896. (code-point-info-general-category info))
  897. (primary-category separator))
  898. (set! reverse-whitespaces
  899. (cons (code-point-info-code-point info)
  900. reverse-whitespaces))))
  901. infos)
  902. (append '(#x009 #x00a #x00b #x00c #x00d)
  903. (reverse reverse-whitespaces))))
  904. (define (write-srfi-14-base-char-sets infos output-file)
  905. (call-with-output-file output-file
  906. (lambda (port)
  907. (display "; Automatically generated by WRITE-SRFI-14-BASE-CHAR-SETS; do not edit."
  908. port)
  909. (newline port)
  910. (newline port)
  911. (let-syntax
  912. ((general-category-predicate
  913. (syntax-rules ()
  914. ((general-category-predicate ?name)
  915. (lambda (info)
  916. (eq? (code-point-info-general-category info)
  917. (general-category ?name))))))
  918. (primary-category-predicate
  919. (syntax-rules ()
  920. ((primary-category-predicate ?name)
  921. (lambda (info)
  922. (eq? (general-category-primary-category
  923. (code-point-info-general-category info))
  924. (primary-category ?name)))))))
  925. (write-srfi-14-base-char-set-definition
  926. 'char-set:lower-case
  927. srfi-14-lower-case?
  928. infos port)
  929. (write-srfi-14-base-char-set-definition
  930. 'char-set:upper-case
  931. srfi-14-upper-case?
  932. infos port)
  933. (write-srfi-14-base-char-set-definition
  934. 'char-set:title-case
  935. (general-category-predicate titlecase-letter)
  936. infos port)
  937. (write-srfi-14-base-char-set-definition
  938. 'char-set:letter
  939. (primary-category-predicate letter)
  940. infos port)
  941. (write-srfi-14-base-char-set-definition
  942. 'char-set:digit
  943. (general-category-predicate decimal-digit-number)
  944. infos port)
  945. (write-srfi-14-base-char-set-definition
  946. 'char-set:mark
  947. (primary-category-predicate mark)
  948. infos port)
  949. (write-srfi-14-base-char-set-definition
  950. 'char-set:separator
  951. (primary-category-predicate separator)
  952. infos port)
  953. (write-srfi-14-base-char-set-definition
  954. 'char-set:punctuation
  955. (primary-category-predicate punctuation)
  956. infos port)
  957. (write-srfi-14-base-char-set-definition
  958. 'char-set:symbol
  959. (primary-category-predicate symbol)
  960. infos port)
  961. (write-srfi-14-base-char-set-definition
  962. 'char-set:space-separator
  963. (general-category-predicate space-separator)
  964. infos port)))))
  965. ; SRFI 14 has funny notions of lower case and upper case
  966. (define (srfi-14-lower-case? info)
  967. (let ((cp (code-point-info-code-point info)))
  968. (and (not (and (>= cp #x2000)
  969. (<= cp #x2fff)))
  970. (= cp (code-point-info-lowercase-code-point info))
  971. (or (not (= cp (code-point-info-uppercase-code-point info)))
  972. (string-contains (code-point-info-name info)
  973. "SMALL LETTER")
  974. (string-contains (code-point-info-name info)
  975. "SMALL LIGATURE")))))
  976. (define (srfi-14-upper-case? info)
  977. (let ((cp (code-point-info-code-point info)))
  978. (and (not (and (>= cp #x2000)
  979. (<= cp #x2fff)))
  980. (= cp (code-point-info-uppercase-code-point info))
  981. (or (not (= cp (code-point-info-lowercase-code-point info)))
  982. (string-contains (code-point-info-name info)
  983. "CAPITAL LETTER")
  984. (string-contains (code-point-info-name info)
  985. "CAPITAL LIGATURE")))))
  986. (define (write-srfi-14-base-char-set-definition name pred infos port)
  987. (write (srfi-14-base-char-set-definition name pred infos)
  988. port)
  989. (newline port))
  990. (define (ranges->range-vector ranges)
  991. (let* ((range-count (length ranges))
  992. (range-vector (make-vector (* 2 (length ranges)))))
  993. (let loop ((i 0) (ranges ranges))
  994. (if (< i range-count)
  995. (begin
  996. (vector-set! range-vector (* 2 i) (caar ranges))
  997. (vector-set! range-vector (+ 1 (* 2 i)) (cdar ranges))
  998. (loop (+ 1 i) (cdr ranges)))))
  999. range-vector))
  1000. (define (srfi-14-base-char-set-definition name pred infos)
  1001. (let ((accumulator (make-ranges-accumulator pred)))
  1002. (for-each-expanded-code-point-info accumulator infos)
  1003. `(define ,name
  1004. (range-vector->char-set
  1005. ',(ranges->range-vector (accumulator 'ranges))))))
  1006. (define (make-ranges-accumulator pred)
  1007. (let ((rev-ranges '())
  1008. (current-left #f)
  1009. (current-right #f))
  1010. ;; assumes the characters arrive with ascending scalar values
  1011. (lambda (message)
  1012. (cond
  1013. ((not (code-point-info? message))
  1014. (if current-left
  1015. (reverse (cons (cons current-left current-right)
  1016. rev-ranges))
  1017. (reverse rev-ranges)))
  1018. ((pred message)
  1019. (let ((scalar-value (code-point-info-code-point message)))
  1020. (cond
  1021. ((not current-left)
  1022. (set! current-left scalar-value)
  1023. (set! current-right (+ 1 scalar-value)))
  1024. ((= scalar-value current-right)
  1025. (set! current-right (+ 1 current-right)))
  1026. (else
  1027. (set! rev-ranges
  1028. (cons (cons current-left current-right)
  1029. rev-ranges))
  1030. (set! current-left scalar-value)
  1031. (set! current-right (+ 1 scalar-value))))))))))
  1032. (define (write-normalization-tables infos excluded port)
  1033. (call-with-values
  1034. (lambda ()
  1035. (make-normalization-encoding-tables infos))
  1036. (lambda (indices encodings)
  1037. (write `(define *normalization-info-block-bits* ,*block-bits*)
  1038. port)
  1039. (newline port)
  1040. (write `(define *normalization-info-indices* ',indices)
  1041. port)
  1042. (newline port)
  1043. (write `(define *normalization-info-encodings* ',encodings)
  1044. port)
  1045. (newline port)))
  1046. (newline port)
  1047. (let ((canonical-pairs (canonical-decomposition-pairs infos)))
  1048. (write `(define *canonical-decomposition-scalar-values*
  1049. ',(list->vector (map car canonical-pairs)))
  1050. port)
  1051. (newline port)
  1052. (write `(define *canonical-decompositions*
  1053. ',(list->vector (map cdr canonical-pairs)))
  1054. port)
  1055. (newline port))
  1056. (newline port)
  1057. (call-with-values
  1058. (lambda ()
  1059. (compatibility-decomposition-tables infos))
  1060. (lambda (decompositions scalar-values indices)
  1061. (write `(define *compatibility-decompositions* ',decompositions)
  1062. port)
  1063. (newline port)
  1064. (write `(define *compatibility-scalar-values* ',scalar-values)
  1065. port)
  1066. (newline port)
  1067. (write `(define *compatibility-indices* ',indices)
  1068. port)
  1069. (newline port)))
  1070. (newline port)
  1071. (let ((composition-pairs (composition-pairs infos excluded)))
  1072. (write `(define *composition-scalar-values*
  1073. ',(list->vector (map car composition-pairs)))
  1074. port)
  1075. (newline port)
  1076. (write `(define *composition-encodings*
  1077. ',(list->vector (map cdr composition-pairs)))
  1078. port)
  1079. (newline port)))
  1080. (define (parse-composition-exclusions filename)
  1081. (call-with-input-file filename
  1082. (lambda (port)
  1083. (let loop ((exclusions '()))
  1084. (let ((thing (read-line port)))
  1085. (cond
  1086. ((eof-object? thing) exclusions)
  1087. ((and (not (string=? "" thing))
  1088. (not (char=? #\# (string-ref thing 0))))
  1089. (let ((end (or (string-skip thing char-set:hex-digit)
  1090. (string-length thing))))
  1091. (loop
  1092. (cons (string->number (substring thing 0 end) 16)
  1093. exclusions))))
  1094. (else (loop exclusions))))))))
  1095. (define (make-normalization-encoding-tables infos)
  1096. (compute-compact-table
  1097. (make-consecutive-info-source
  1098. (expanded-code-point-info-source infos)
  1099. (lambda (code-point)
  1100. (make-code-point-info code-point
  1101. "<unassigned>"
  1102. (general-category unassigned)
  1103. 0 #f #f #f #f #f #f #f #f #f
  1104. code-point code-point code-point))
  1105. (lambda (info)
  1106. (bitwise-ior (code-point-info-combining-class info) ; 0..240
  1107. (if (code-point-info-canonical-decomposition info)
  1108. #x100
  1109. 0)
  1110. (if (code-point-info-compatibility-decomposition info)
  1111. #x200
  1112. 0))))
  1113. (expt 2 *block-bits*)))
  1114. (define (encode-canonical-decomposition l)
  1115. (cond
  1116. ((null? (cdr l))
  1117. (if (> (car l) #xffff)
  1118. l
  1119. (car l)))
  1120. (else
  1121. (let ((a (car l))
  1122. (b (cadr l)))
  1123. (if (or (> a #xffff)
  1124. (> b #xffff))
  1125. (cons a b)
  1126. (bitwise-ior (arithmetic-shift b 16) a))))))
  1127. ;; generate an alist that maps scalar values to decomposition encodings
  1128. (define (canonical-decomposition-pairs infos)
  1129. (let ((pairs '()))
  1130. (for-each-expanded-code-point-info
  1131. (lambda (info)
  1132. (cond
  1133. ((code-point-info-canonical-decomposition info)
  1134. => (lambda (d)
  1135. (set! pairs
  1136. (cons
  1137. (cons (code-point-info-code-point info)
  1138. (encode-canonical-decomposition d))
  1139. pairs))))))
  1140. infos)
  1141. (reverse pairs)))
  1142. (define (compatibility-decomposition-tables infos)
  1143. (let ((reverse-decomps '())
  1144. (decomp-index 0)
  1145. (rev-infos '()))
  1146. (for-each-expanded-code-point-info
  1147. (lambda (info)
  1148. (cond
  1149. ((code-point-info-compatibility-decomposition info)
  1150. => (lambda (d)
  1151. (let ((size (length d)))
  1152. (set! reverse-decomps
  1153. (append (reverse d) reverse-decomps))
  1154. (set! rev-infos
  1155. (cons (cons (code-point-info-code-point info)
  1156. decomp-index)
  1157. rev-infos))
  1158. (set! decomp-index (+ decomp-index size)))))))
  1159. infos)
  1160. (let ((decomps (list->vector (reverse reverse-decomps))))
  1161. (values decomps
  1162. (list->vector (map car (reverse rev-infos)))
  1163. (list->vector
  1164. (map cdr (reverse (cons (cons #f (vector-length decomps)) rev-infos))))))))
  1165. (define (composition-pairs infos excluded)
  1166. (let ((pairs '()))
  1167. (for-each-expanded-code-point-info
  1168. (lambda (info)
  1169. (cond
  1170. ((code-point-info-canonical-decomposition info)
  1171. => (lambda (d)
  1172. (if (and (pair? (cdr d)) ; not a singleton
  1173. (not (member (code-point-info-code-point info) excluded))
  1174. (code-point-info-combining-class
  1175. (find-code-point-info (car d) infos))) ; possibly expensive
  1176. (set! pairs
  1177. (cons (cons (code-point-info-code-point info)
  1178. (encode-composition d))
  1179. pairs)))))))
  1180. infos)
  1181. (sort-list pairs
  1182. (lambda (p1 p2)
  1183. (< (cdr p1) (cdr p2))))))
  1184. (define (encode-composition l)
  1185. (if (or (> (car l) #xffff)
  1186. (> (cadr l) #xffff))
  1187. (error "non-BMP composition"))
  1188. (bitwise-ior (arithmetic-shift (cadr l) 16)
  1189. (car l)))
  1190. ; for debugging
  1191. (define (test-code-point-case+general-category-encoding-tables
  1192. infos special-uppercase-table special-lowercase-table
  1193. specialcasings
  1194. indices encodings
  1195. uppercase-offsets lowercase-offsets titlecase-offsets)
  1196. (let ((lower-mask (- (arithmetic-shift 1 *block-bits*) 1))
  1197. (uppercase-index-width (bits-necessary (vector-length uppercase-offsets)))
  1198. (lowercase-index-width (bits-necessary (vector-length lowercase-offsets)))
  1199. (titlecase-index-width (bits-necessary (vector-length titlecase-offsets))))
  1200. (for-each-expanded-code-point-info
  1201. (lambda (info)
  1202. (let* ((code-point (code-point-info-code-point info))
  1203. (base-index (vector-ref indices
  1204. (arithmetic-shift code-point (- *block-bits*))))
  1205. (index (+ base-index (bitwise-and code-point lower-mask)))
  1206. (encoding (vector-ref encodings index)))
  1207. (if (not (eq? (code-point-info-general-category info)
  1208. (general-category surrogate)))
  1209. (begin
  1210. (if (not (eq? (code-point-info-general-category info)
  1211. (code-point-encoding-general-category encoding)))
  1212. (error "general category mismatch"
  1213. info
  1214. (code-point-encoding-general-category encoding)))
  1215. (let ((uppercase-code-point
  1216. (code-point-encoding-uppercase-code-point
  1217. code-point encoding
  1218. uppercase-offsets
  1219. uppercase-index-width lowercase-index-width titlecase-index-width))
  1220. (lowercase-code-point
  1221. (code-point-encoding-lowercase-code-point
  1222. code-point encoding
  1223. lowercase-offsets
  1224. uppercase-index-width lowercase-index-width titlecase-index-width))
  1225. (titlecase-code-point
  1226. (code-point-encoding-titlecase-code-point
  1227. code-point encoding
  1228. titlecase-offsets
  1229. uppercase-index-width lowercase-index-width titlecase-index-width))
  1230. (uppercase?
  1231. (code-point-encoding-uppercase?
  1232. encoding
  1233. uppercase-index-width lowercase-index-width titlecase-index-width))
  1234. (lowercase?
  1235. (code-point-encoding-lowercase?
  1236. encoding
  1237. uppercase-index-width lowercase-index-width titlecase-index-width)))
  1238. (if (not (= (code-point-info-uppercase-code-point info)
  1239. uppercase-code-point))
  1240. (error "uppercase mismatch" info uppercase-code-point))
  1241. (if (not (= (code-point-info-lowercase-code-point info)
  1242. lowercase-code-point))
  1243. (error "lowercase mismatch" info lowercase-code-point))
  1244. (if (not (= (code-point-info-titlecase-code-point info)
  1245. titlecase-code-point))
  1246. (error "titlecase mismatch" info titlecase-code-point))
  1247. (if (not (eq? (or (table-ref special-uppercase-table code-point)
  1248. (eq? (code-point-info-general-category info)
  1249. (general-category uppercase-letter)))
  1250. uppercase?))
  1251. (error "uppercase? mismatch" info code-point))
  1252. (if (not (eq? (or (table-ref special-lowercase-table code-point)
  1253. (eq? (code-point-info-general-category info)
  1254. (general-category lowercase-letter)))
  1255. lowercase?))
  1256. (error "lowercase? mismatch" info code-point))
  1257. )))))
  1258. infos)))
  1259. (define (check-unicode-tables unicode-data-filename
  1260. proplist-filename
  1261. specialcasing-filename)
  1262. (let ((infos (parse-unicode-data unicode-data-filename))
  1263. (specialcasings (parse-specialcasing specialcasing-filename)))
  1264. (call-with-values
  1265. (lambda ()
  1266. (parse-proplist-for-upper/lowercase proplist-filename))
  1267. (lambda (special-uppercase-table special-lowercase-table)
  1268. (call-with-values
  1269. (lambda ()
  1270. (make-scalar-value-case+general-category-encoding-tables
  1271. infos
  1272. special-lowercase-table special-uppercase-table
  1273. specialcasings))
  1274. (lambda (indices
  1275. encodings
  1276. uppercase-offsets lowercase-offsets titlecase-offsets)
  1277. (test-code-point-case+general-category-encoding-tables
  1278. infos special-uppercase-table special-lowercase-table
  1279. specialcasings
  1280. indices encodings
  1281. uppercase-offsets lowercase-offsets titlecase-offsets)))))))
  1282. (define (find-code-point-info code-point infos)
  1283. (call-with-current-continuation
  1284. (lambda (return)
  1285. (for-each-expanded-code-point-info
  1286. (lambda (info)
  1287. (if (= code-point (code-point-info-code-point info))
  1288. (return info)))
  1289. infos))))