unicode-charmap.scm 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; Copyright (c) 2005-2006 by Basis Technology Corporation. See file COPYING.
  3. ; Unicode version of env/rts/charmap.scm, replaces relevant procedures there.
  4. (define *encoding-table-block-mask*
  5. (- (arithmetic-shift 1 *encoding-table-block-bits*) 1))
  6. ; access the compact table
  7. (define (char-info-encoding c)
  8. (let ((scalar-value (char->scalar-value c)))
  9. (vector-ref *scalar-value-info-encodings*
  10. (+ (vector-ref *scalar-value-info-indices*
  11. (arithmetic-shift scalar-value (- *encoding-table-block-bits*)))
  12. (bitwise-and scalar-value
  13. *encoding-table-block-mask*)))))
  14. (define *scalar-value-encoding-general-category-mask*
  15. (- (arithmetic-shift 1 *general-category-bits*) 1))
  16. (define (char-general-category c)
  17. (vector-ref general-categories
  18. (bitwise-and (char-info-encoding c)
  19. *scalar-value-encoding-general-category-mask*)))
  20. (define (unicode-char-alphabetic? c)
  21. (eq? (primary-category letter)
  22. (general-category-primary-category (char-general-category c))))
  23. (define (unicode-char-numeric? c)
  24. (eq? (primary-category number)
  25. (general-category-primary-category (char-general-category c))))
  26. (define (unicode-char-whitespace? c)
  27. (or (eq? (general-category space-separator)
  28. (char-general-category c))
  29. (let ((scalar-value (char->scalar-value c)))
  30. ;; space, horizontal tab, line feed (= newline), vertical tab,
  31. ;; form feed, and carriage return
  32. (and (>= scalar-value 9)
  33. (<= scalar-value 13)))))
  34. (define (unicode-char-lower-case? c)
  35. (let ((encoding (char-info-encoding c)))
  36. (not
  37. (zero?
  38. (bitwise-and 1
  39. (arithmetic-shift encoding
  40. (- (+ *uppercase-index-width*
  41. *lowercase-index-width*
  42. *titlecase-index-width*
  43. *general-category-bits*))))))))
  44. (define (unicode-char-upper-case? c)
  45. (let ((encoding (char-info-encoding c)))
  46. (not
  47. (zero?
  48. (bitwise-and 1
  49. (arithmetic-shift encoding
  50. (- (+ 1
  51. *uppercase-index-width*
  52. *lowercase-index-width*
  53. *titlecase-index-width*
  54. *general-category-bits*))))))))
  55. (define (lookup-by-offset-index scalar-value offset-index offsets)
  56. (scalar-value->char
  57. (+ scalar-value (vector-ref offsets offset-index))))
  58. (define *uppercase-mask* (- (arithmetic-shift 1 *uppercase-index-width*) 1))
  59. (define (unicode-char-upcase c)
  60. (let ((scalar-value (char->scalar-value c))
  61. (encoding (char-info-encoding c)))
  62. (lookup-by-offset-index
  63. scalar-value
  64. (bitwise-and *uppercase-mask*
  65. (arithmetic-shift encoding
  66. (- (+ *lowercase-index-width*
  67. *titlecase-index-width*
  68. *general-category-bits*))))
  69. *uppercase-offsets*)))
  70. (define *lowercase-mask* (- (arithmetic-shift 1 *lowercase-index-width*) 1))
  71. (define (unicode-char-downcase c)
  72. (let ((scalar-value (char->scalar-value c))
  73. (encoding (char-info-encoding c)))
  74. (lookup-by-offset-index
  75. scalar-value
  76. (bitwise-and *lowercase-mask*
  77. (arithmetic-shift encoding
  78. (- (+ *titlecase-index-width* *general-category-bits*))))
  79. *lowercase-offsets*)))
  80. (define (char-foldcase c)
  81. (case (char->scalar-value c)
  82. ((#x130 #x131) ; Turkish 0 and 1
  83. c)
  84. (else
  85. (char-downcase (char-upcase c)))))
  86. ; Now replace the ASCII-only procedures by these
  87. (set-char-map-procedures! unicode-char-alphabetic?
  88. unicode-char-numeric?
  89. unicode-char-whitespace?
  90. unicode-char-upper-case?
  91. unicode-char-lower-case?
  92. unicode-char-upcase
  93. unicode-char-downcase
  94. char-foldcase)
  95. ; Unicode bonus material
  96. (define (char-title-case? c)
  97. (eq? (general-category titlecase-letter)
  98. (char-general-category c)))
  99. (define *titlecase-mask* (- (arithmetic-shift 1 *titlecase-index-width*) 1))
  100. (define (char-titlecase c)
  101. (let ((scalar-value (char->scalar-value c))
  102. (encoding (char-info-encoding c)))
  103. (lookup-by-offset-index
  104. scalar-value
  105. (bitwise-and *titlecase-mask*
  106. (arithmetic-shift encoding (- *general-category-bits*)))
  107. *titlecase-offsets*)))
  108. ; check if the mapping in UnicodeDate.txt is not authoritative, and we
  109. ; should use the one in SpecialCasing.txt
  110. (define (unicode-char-specialcasing? c)
  111. (let ((encoding (char-info-encoding c)))
  112. (not
  113. (zero?
  114. (bitwise-and 1
  115. (arithmetic-shift encoding
  116. (- (+ 2
  117. *uppercase-index-width*
  118. *lowercase-index-width*
  119. *titlecase-index-width*
  120. *general-category-bits*))))))))
  121. (define (prepend-specialcasing-reverse start length c r)
  122. (let loop ((j 0)
  123. (r r))
  124. (if (>= j length)
  125. r
  126. (loop (+ j 1)
  127. (cons (string-ref *specialcasings* (+ start j))
  128. r)))))
  129. (define (string-xcase char-xcase prepend-specialcasing-xcase/reverse
  130. s)
  131. (let ((size (string-length s)))
  132. (let loop ((i 0) (r '()))
  133. (if (>= i size)
  134. (list->string (reverse r))
  135. (let ((c (string-ref s i)))
  136. (loop (+ 1 i)
  137. (if (unicode-char-specialcasing? c)
  138. (prepend-specialcasing-xcase/reverse c r s i size)
  139. (cons (char-xcase c) r))))))))
  140. (define (prepend-specialcasing-upcase/reverse c r s i size)
  141. (let ((specialcasing
  142. (table-ref *specialcasing-table* (char->scalar-value c))))
  143. (prepend-specialcasing-reverse (specialcasing-uppercase-start specialcasing)
  144. (specialcasing-uppercase-length specialcasing)
  145. c r)))
  146. (define (string-upcase s)
  147. (string-xcase char-upcase prepend-specialcasing-upcase/reverse s))
  148. (define (prepend-specialcasing-downcase/reverse c r s i size)
  149. (let ((specialcasing
  150. (table-ref *specialcasing-table* (char->scalar-value c))))
  151. (if (and (specialcasing-final-sigma? specialcasing)
  152. (or (and (< (+ 1 i) size) ; a letter follows
  153. (unicode-char-alphabetic? (string-ref s (+ 1 i))))
  154. (or (zero? i) ; it's the only letter in the word
  155. (not (unicode-char-alphabetic? (string-ref s (- i 1)))))))
  156. (cons (char-downcase c) r)
  157. (prepend-specialcasing-reverse (specialcasing-lowercase-start specialcasing)
  158. (specialcasing-lowercase-length specialcasing)
  159. c r))))
  160. (define (string-downcase s)
  161. (string-xcase char-downcase prepend-specialcasing-downcase/reverse s))
  162. (define (prepend-specialcasing-foldcase/reverse c r s i size)
  163. (let ((specialcasing
  164. (table-ref *specialcasing-table* (char->scalar-value c))))
  165. (prepend-specialcasing-reverse (specialcasing-foldcase-start specialcasing)
  166. (specialcasing-foldcase-length specialcasing)
  167. c r)))
  168. (define (string-foldcase s)
  169. (string-xcase char-foldcase prepend-specialcasing-foldcase/reverse s))
  170. (define (string-ci-comparator cs-comp)
  171. (lambda (a-string b-string)
  172. (cs-comp (string-foldcase a-string) (string-foldcase b-string))))
  173. (define string-ci=? (string-ci-comparator string=?))
  174. (define string-ci<? (string-ci-comparator string<?))
  175. (set-string-ci-procedures! string-ci=? string-ci<?)
  176. ; Titlecase
  177. (define (char-cased? c)
  178. (or (char-lower-case? c)
  179. (char-upper-case? c)
  180. (char-title-case? c)))
  181. (define u+00ad (scalar-value->char #x00ad)) ; SOFT HYPHEN (SHY)
  182. (define u+2019 (scalar-value->char #x2019)) ; RIGHT SINGLE QUOTATION MARK
  183. (define (char-case-ignorable? c)
  184. ;; Mike suspects this list is not complete
  185. (or (char=? c #\')
  186. (char=? c u+00ad)
  187. (char=? c u+2019)
  188. (let ((cat (char-general-category c)))
  189. (or (eq? cat (general-category non-spacing-mark))
  190. (eq? cat (general-category enclosing-mark))
  191. (eq? cat (general-category formatting-character))
  192. (eq? cat (general-category modifier-symbol))))))
  193. (define (string-titlecase s)
  194. (let ((size (string-length s)))
  195. (let loop ((i 0) (r '()))
  196. ;; looking for a letter
  197. (if (>= i size)
  198. (list->string (reverse r))
  199. (let ((c (string-ref s i)))
  200. (if (char-cased? c)
  201. (let casing-loop
  202. ((j (+ 1 i))
  203. (r (if (unicode-char-specialcasing? c)
  204. (let ((specialcasing
  205. (table-ref *specialcasing-table* (char->scalar-value c))))
  206. (prepend-specialcasing-reverse
  207. (specialcasing-titlecase-start specialcasing)
  208. (specialcasing-titlecase-length specialcasing)
  209. c r))
  210. (cons (char-titlecase c) r))))
  211. (if (>= j size)
  212. (list->string (reverse r))
  213. (let ((c (string-ref s j)))
  214. (cond
  215. ((char-case-ignorable? c)
  216. (casing-loop (+ j 1) (cons c r)))
  217. ((char-cased? c)
  218. (casing-loop (+ j 1)
  219. (if (unicode-char-specialcasing? c)
  220. (prepend-specialcasing-downcase/reverse c r s j size)
  221. (cons (char-downcase c) r))))
  222. (else
  223. (loop j r))))))
  224. (loop (+ 1 i) (cons c r))))))))