unicode-normalization.scm 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; returns index of value (must be number) in vector
  3. (define (binary-search vec val)
  4. (let ((size (vector-length vec)))
  5. (let loop ((low 0) ; inclusive
  6. (high size)) ; exclusive
  7. (cond
  8. ((< low (- high 1))
  9. (let* ((pos (quotient (+ low high) 2)) ; always in
  10. (at (vector-ref vec pos)))
  11. (cond
  12. ((= val at) pos)
  13. ((< val at)
  14. (loop low pos))
  15. (else
  16. (loop pos high)))))
  17. ((< low high)
  18. (if (= val (vector-ref vec low))
  19. low
  20. #f))
  21. (else #f)))))
  22. (define *normalization-info-block-mask*
  23. (- (arithmetic-shift 1 *normalization-info-block-bits*) 1))
  24. (define (sv-normalization-info s)
  25. (vector-ref *normalization-info-encodings*
  26. (+ (vector-ref *normalization-info-indices*
  27. (arithmetic-shift s (- *normalization-info-block-bits*)))
  28. (bitwise-and s
  29. *normalization-info-block-mask*))))
  30. (define (sv-combining-class s)
  31. (bitwise-and (sv-normalization-info s) #xff))
  32. (define (sv-has-canonical-decomposition? s)
  33. (not (zero? (bitwise-and (sv-normalization-info s) #x100))))
  34. (define (sv-has-compatibility-decomposition? s)
  35. (not (zero? (bitwise-and (sv-normalization-info s) #x200))))
  36. ;; Hangul constants
  37. ;; from Unicode Standard Annex #15
  38. (define jamo-syllable-start #xAC00)
  39. (define jamo-initial-consonant-start #x1100)
  40. (define jamo-initial-consonant-count 19)
  41. (define jamo-initial-consonant-end (+ jamo-initial-consonant-start jamo-initial-consonant-count))
  42. (define jamo-trailing-consonant-start #x11A7)
  43. (define jamo-trailing-consonant-count 28)
  44. (define jamo-trailing-consonant-end (+ jamo-trailing-consonant-start jamo-trailing-consonant-count))
  45. (define jamo-vowel-start #x1161)
  46. (define jamo-vowel-count 21)
  47. (define jamo-vowel-end (+ jamo-vowel-start jamo-vowel-count))
  48. ;; number of syllables with a given initial consonant
  49. (define jamo-syllable-per-count
  50. (* jamo-vowel-count jamo-trailing-consonant-count))
  51. (define jamo-syllable-count
  52. (* jamo-initial-consonant-count jamo-syllable-per-count))
  53. (define jamo-syllable-end (+ jamo-syllable-start jamo-syllable-count))
  54. (define (sv-jamo-initial-consonant? sv)
  55. (and (>= sv jamo-initial-consonant-start)
  56. (< sv jamo-initial-consonant-end)))
  57. (define (sv-jamo-trailing-consonant? sv)
  58. (and (>= sv jamo-trailing-consonant-start)
  59. (< sv jamo-trailing-consonant-end)))
  60. (define (sv-jamo-vowel? sv)
  61. (and (>= sv jamo-vowel-start)
  62. (< sv jamo-vowel-end)))
  63. ;; assumes SV-HAS-CANONICAL-DECOMPOSITION? has returned #t
  64. (define (sv-canonical-decomposition-encoding s)
  65. (vector-ref *canonical-decompositions*
  66. (binary-search *canonical-decomposition-scalar-values* s)))
  67. (define (string-normalize-nfd s)
  68. (decompose #f s))
  69. (define (string-normalize-nfkd s)
  70. (decompose #t s))
  71. (define (decompose compat? s)
  72. (let ((size (string-length s)))
  73. (let loop ((i 0)
  74. (rev-chars '()))
  75. (if (>= i size)
  76. (reorder-according-to-combining-class!
  77. (list->string (reverse rev-chars)))
  78. (let* ((c (string-ref s i))
  79. (sv (char->scalar-value c)))
  80. (if (sv-hangul-syllable? sv)
  81. (loop (+ 1 i)
  82. (prepend-reverse-jamo-decomposition sv rev-chars))
  83. (loop (+ 1 i)
  84. (append (reverse-decomposition compat? sv) rev-chars))))))))
  85. (define (sv-hangul-syllable? sv)
  86. (and (>= sv jamo-syllable-start)
  87. (< sv jamo-syllable-end)))
  88. (define (prepend-reverse-jamo-decomposition sv rev-chars)
  89. (let* ((offset (- sv jamo-syllable-start))
  90. (l (+ jamo-initial-consonant-start
  91. (quotient offset jamo-syllable-per-count)))
  92. (v (+ jamo-vowel-start
  93. (quotient (modulo offset jamo-syllable-per-count)
  94. jamo-trailing-consonant-count)))
  95. (t (+ jamo-trailing-consonant-start
  96. (modulo offset jamo-trailing-consonant-count)))
  97. (either-way
  98. (cons (scalar-value->char v)
  99. (cons (scalar-value->char l)
  100. rev-chars))))
  101. (if (= t jamo-trailing-consonant-start)
  102. either-way
  103. (cons (scalar-value->char t) either-way))))
  104. (define (reverse-decomposition compat? sv)
  105. (let recur ((sv sv))
  106. (cond
  107. ((and compat? (sv-has-compatibility-decomposition? sv))
  108. (let* ((pos (binary-search *compatibility-scalar-values* sv))
  109. (end (vector-ref *compatibility-indices* (+ pos 1))))
  110. (let loop ((index (vector-ref *compatibility-indices* pos))
  111. (rev '()))
  112. (if (>= index end)
  113. rev
  114. (loop (+ 1 index)
  115. (append (recur (vector-ref *compatibility-decompositions* index))
  116. rev))))))
  117. ((sv-has-canonical-decomposition? sv)
  118. (let ((enc (sv-canonical-decomposition-encoding sv)))
  119. (cond
  120. ;; it's either a number with one or two concatenated 16-bit numbers from
  121. ;; the BMP
  122. ((number? enc)
  123. (let ((rest (recur (bitwise-and #xffff enc)))
  124. (second (bitwise-and #xffff (arithmetic-shift enc -16))))
  125. (if (zero? second)
  126. rest
  127. (append (recur second) rest))))
  128. ;; ... or a 1-element list or pair of scalar values
  129. ((null? (cdr enc))
  130. ;; 1 element
  131. (recur (car enc)))
  132. ;; 2 is max
  133. (else
  134. (append (recur (cdr enc))
  135. (recur (car enc)))))))
  136. (else
  137. (list (scalar-value->char sv))))))
  138. ; bubble-sort decompositions accoring to combining class
  139. ; returns the modified string
  140. (define (reorder-according-to-combining-class! s)
  141. (let ((size (string-length s)))
  142. (let repeat ()
  143. (let loop ((i 0)
  144. (swapped? #f))
  145. (cond
  146. ((< (+ i 1) size)
  147. (let ((sv-i (char->scalar-value (string-ref s i)))
  148. (sv-i+1 (char->scalar-value (string-ref s (+ i 1)))))
  149. (let ((cc-i (sv-combining-class sv-i))
  150. (cc-i+1 (sv-combining-class sv-i+1)))
  151. (if (and (not (zero? cc-i))
  152. (not (zero? cc-i+1))
  153. (< cc-i+1 cc-i))
  154. (begin
  155. (string-set! s i (scalar-value->char sv-i+1))
  156. (string-set! s (+ i 1) (scalar-value->char sv-i))
  157. (loop (+ 1 i) #t))
  158. (loop (+ 1 i) swapped?)))))
  159. (swapped? (repeat))
  160. (else s))))))
  161. (define (compose-2 sv-1 sv-2)
  162. (let ((encoding (bitwise-ior (arithmetic-shift sv-2 16)
  163. sv-1)))
  164. (cond
  165. ((binary-search *composition-encodings* encoding)
  166. => (lambda (index)
  167. (vector-ref *composition-scalar-values* index)))
  168. (else #f))))
  169. (define (compose! s)
  170. (let ((size (string-length s)))
  171. (let loop ((p 0) ; output index for finished combined character
  172. (p2 1) ; output index for uncombined characters
  173. (i 0) ; input index for starting character
  174. (j 1)) ; input index for characters to be combined
  175. (if (< i size)
  176. (let* ((sv-i (char->scalar-value (string-ref s i)))
  177. (cc-i (sv-combining-class sv-i)))
  178. (if (zero? cc-i)
  179. (if (= j size)
  180. (begin
  181. ;; we're done combining with sv-i; skip past
  182. ;; combining sequences in both input and output
  183. (string-set! s p (scalar-value->char sv-i))
  184. (substring s 0 (min size (max (+ p 1) p2))))
  185. (let* ((sv-j (char->scalar-value (string-ref s j)))
  186. (cc-j (sv-combining-class sv-j)))
  187. (cond
  188. ((and (= j (+ i 1))
  189. (sv-jamo-initial-consonant? sv-i)
  190. (sv-jamo-vowel? sv-j))
  191. ;; need Hangul composition
  192. (if (and (< (+ j 1) size)
  193. (sv-jamo-trailing-consonant?
  194. (char->scalar-value (string-ref s (+ j 1)))))
  195. ;; 3-char composition
  196. (let ((composite
  197. (+ jamo-syllable-start
  198. (* (- sv-i jamo-initial-consonant-start)
  199. jamo-syllable-per-count)
  200. (* (- sv-j jamo-vowel-start)
  201. jamo-trailing-consonant-count)
  202. (- (char->scalar-value (string-ref s (+ j 1)))
  203. jamo-trailing-consonant-start))))
  204. (string-set! s i (scalar-value->char composite))
  205. (loop p p2 i (+ j 2)))
  206. ;; 2-char composition
  207. (let ((composite
  208. (+ jamo-syllable-start
  209. (* (- sv-i jamo-initial-consonant-start)
  210. jamo-syllable-per-count)
  211. (* (- sv-j jamo-vowel-start)
  212. jamo-trailing-consonant-count))))
  213. (string-set! s i (scalar-value->char composite))
  214. (loop p p2 i (+ j 1)))))
  215. ((let ((previous-cc (sv-combining-class (char->scalar-value (string-ref s (- j 1))))))
  216. ;; check if blocked
  217. (and (<= previous-cc cc-j)
  218. (compose-2 sv-i sv-j)))
  219. ;; we can combine; store result temporarily at i;
  220. ;; advance past the combining mark
  221. => (lambda (combined)
  222. (string-set! s i (scalar-value->char combined))
  223. (loop p p2 i (+ j 1))))
  224. ((zero? cc-j)
  225. ;; both are combining class 0; we're done
  226. ;; combining with sv-i; skip past combining sequences
  227. ;; in both input and output
  228. (string-set! s p (scalar-value->char sv-i))
  229. (loop p2 (+ p2 1) j (+ 1 j)))
  230. (else
  231. (let skip ((j j) (p2 p2))
  232. (if (< j size)
  233. (let ((sv-j (char->scalar-value (string-ref s j))))
  234. (if (= (sv-combining-class sv-j) cc-j)
  235. (begin
  236. (string-set! s p2 (scalar-value->char sv-j))
  237. (skip (+ j 1) (+ p2 1)))
  238. (loop p p2 i j)))
  239. (loop p p2 i j)))))))
  240. (loop (+ p 1) (+ p2 1) (+ i 1) (+ j 1))))
  241. (substring s 0 (min size p2))))))
  242. (define (string-normalize-nfc s)
  243. (compose! (string-normalize-nfd s)))
  244. (define (string-normalize-nfkc s)
  245. (compose! (string-normalize-nfkd s)))