unicode-charmap.scm 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. ; Part of Scheme 48 1.9. See file COPYING for notices and license.
  2. ; Authors: Mike Sperber
  3. ; Copyright (c) 2005-2006 by Basis Technology Corporation.
  4. ; Unicode version of env/rts/charmap.scm, replaces relevant procedures there.
  5. (define *encoding-table-block-mask*
  6. (- (arithmetic-shift 1 *encoding-table-block-bits*) 1))
  7. ; access the compact table
  8. (define (char-info-encoding c)
  9. (let ((scalar-value (char->scalar-value c)))
  10. (vector-ref *scalar-value-info-encodings*
  11. (+ (vector-ref *scalar-value-info-indices*
  12. (arithmetic-shift scalar-value (- *encoding-table-block-bits*)))
  13. (bitwise-and scalar-value
  14. *encoding-table-block-mask*)))))
  15. (define *scalar-value-encoding-general-category-mask*
  16. (- (arithmetic-shift 1 *general-category-bits*) 1))
  17. (define (char-general-category c)
  18. (vector-ref general-categories
  19. (bitwise-and (char-info-encoding c)
  20. *scalar-value-encoding-general-category-mask*)))
  21. (define (unicode-char-alphabetic? c)
  22. (eq? (primary-category letter)
  23. (general-category-primary-category (char-general-category c))))
  24. (define (unicode-char-numeric? c)
  25. (eq? (primary-category number)
  26. (general-category-primary-category (char-general-category c))))
  27. (define (unicode-char-whitespace? c)
  28. (or (eq? (general-category space-separator)
  29. (char-general-category c))
  30. (let ((scalar-value (char->scalar-value c)))
  31. ;; space, horizontal tab, line feed (= newline), vertical tab,
  32. ;; form feed, and carriage return
  33. (and (>= scalar-value 9)
  34. (<= scalar-value 13)))))
  35. (define (unicode-char-lower-case? c)
  36. (let ((encoding (char-info-encoding c)))
  37. (not
  38. (zero?
  39. (bitwise-and 1
  40. (arithmetic-shift encoding
  41. (- (+ *uppercase-index-width*
  42. *lowercase-index-width*
  43. *titlecase-index-width*
  44. *general-category-bits*))))))))
  45. (define (unicode-char-upper-case? c)
  46. (let ((encoding (char-info-encoding c)))
  47. (not
  48. (zero?
  49. (bitwise-and 1
  50. (arithmetic-shift encoding
  51. (- (+ 1
  52. *uppercase-index-width*
  53. *lowercase-index-width*
  54. *titlecase-index-width*
  55. *general-category-bits*))))))))
  56. (define (lookup-by-offset-index scalar-value offset-index offsets)
  57. (scalar-value->char
  58. (+ scalar-value (vector-ref offsets offset-index))))
  59. (define *uppercase-mask* (- (arithmetic-shift 1 *uppercase-index-width*) 1))
  60. (define (unicode-char-upcase c)
  61. (let ((scalar-value (char->scalar-value c))
  62. (encoding (char-info-encoding c)))
  63. (lookup-by-offset-index
  64. scalar-value
  65. (bitwise-and *uppercase-mask*
  66. (arithmetic-shift encoding
  67. (- (+ *lowercase-index-width*
  68. *titlecase-index-width*
  69. *general-category-bits*))))
  70. *uppercase-offsets*)))
  71. (define *lowercase-mask* (- (arithmetic-shift 1 *lowercase-index-width*) 1))
  72. (define (unicode-char-downcase c)
  73. (let ((scalar-value (char->scalar-value c))
  74. (encoding (char-info-encoding c)))
  75. (lookup-by-offset-index
  76. scalar-value
  77. (bitwise-and *lowercase-mask*
  78. (arithmetic-shift encoding
  79. (- (+ *titlecase-index-width* *general-category-bits*))))
  80. *lowercase-offsets*)))
  81. (define (char-foldcase c)
  82. (case (char->scalar-value c)
  83. ((#x130 #x131) ; Turkish 0 and 1
  84. c)
  85. (else
  86. (char-downcase (char-upcase c)))))
  87. ; Now replace the ASCII-only procedures by these
  88. (set-char-map-procedures! unicode-char-alphabetic?
  89. unicode-char-numeric?
  90. unicode-char-whitespace?
  91. unicode-char-upper-case?
  92. unicode-char-lower-case?
  93. unicode-char-upcase
  94. unicode-char-downcase
  95. char-foldcase)
  96. ; Unicode bonus material
  97. (define (char-title-case? c)
  98. (eq? (general-category titlecase-letter)
  99. (char-general-category c)))
  100. (define *titlecase-mask* (- (arithmetic-shift 1 *titlecase-index-width*) 1))
  101. (define (char-titlecase c)
  102. (let ((scalar-value (char->scalar-value c))
  103. (encoding (char-info-encoding c)))
  104. (lookup-by-offset-index
  105. scalar-value
  106. (bitwise-and *titlecase-mask*
  107. (arithmetic-shift encoding (- *general-category-bits*)))
  108. *titlecase-offsets*)))
  109. ; check if the mapping in UnicodeDate.txt is not authoritative, and we
  110. ; should use the one in SpecialCasing.txt
  111. (define (unicode-char-specialcasing? c)
  112. (let ((encoding (char-info-encoding c)))
  113. (not
  114. (zero?
  115. (bitwise-and 1
  116. (arithmetic-shift encoding
  117. (- (+ 2
  118. *uppercase-index-width*
  119. *lowercase-index-width*
  120. *titlecase-index-width*
  121. *general-category-bits*))))))))
  122. (define (prepend-specialcasing-reverse start length c r)
  123. (let loop ((j 0)
  124. (r r))
  125. (if (>= j length)
  126. r
  127. (loop (+ j 1)
  128. (cons (string-ref *specialcasings* (+ start j))
  129. r)))))
  130. (define (string-xcase char-xcase prepend-specialcasing-xcase/reverse
  131. s)
  132. (let ((size (string-length s)))
  133. (let loop ((i 0) (r '()))
  134. (if (>= i size)
  135. (list->string (reverse r))
  136. (let ((c (string-ref s i)))
  137. (loop (+ 1 i)
  138. (if (unicode-char-specialcasing? c)
  139. (prepend-specialcasing-xcase/reverse c r s i size)
  140. (cons (char-xcase c) r))))))))
  141. (define (prepend-specialcasing-upcase/reverse c r s i size)
  142. (let ((specialcasing
  143. (table-ref *specialcasing-table* (char->scalar-value c))))
  144. (prepend-specialcasing-reverse (specialcasing-uppercase-start specialcasing)
  145. (specialcasing-uppercase-length specialcasing)
  146. c r)))
  147. (define (string-upcase s)
  148. (string-xcase char-upcase prepend-specialcasing-upcase/reverse s))
  149. (define (prepend-specialcasing-downcase/reverse c r s i size)
  150. (let ((specialcasing
  151. (table-ref *specialcasing-table* (char->scalar-value c))))
  152. (if (and (specialcasing-final-sigma? specialcasing)
  153. (or (and (< (+ 1 i) size) ; a letter follows
  154. (unicode-char-alphabetic? (string-ref s (+ 1 i))))
  155. (or (zero? i) ; it's the only letter in the word
  156. (not (unicode-char-alphabetic? (string-ref s (- i 1)))))))
  157. (cons (char-downcase c) r)
  158. (prepend-specialcasing-reverse (specialcasing-lowercase-start specialcasing)
  159. (specialcasing-lowercase-length specialcasing)
  160. c r))))
  161. (define (string-downcase s)
  162. (string-xcase char-downcase prepend-specialcasing-downcase/reverse s))
  163. (define (prepend-specialcasing-foldcase/reverse c r s i size)
  164. (let ((specialcasing
  165. (table-ref *specialcasing-table* (char->scalar-value c))))
  166. (prepend-specialcasing-reverse (specialcasing-foldcase-start specialcasing)
  167. (specialcasing-foldcase-length specialcasing)
  168. c r)))
  169. (define (string-foldcase s)
  170. (string-xcase char-foldcase prepend-specialcasing-foldcase/reverse s))
  171. (define (string-ci-comparator cs-comp)
  172. (lambda (a-string b-string)
  173. (cs-comp (string-foldcase a-string) (string-foldcase b-string))))
  174. (define string-ci=? (string-ci-comparator string=?))
  175. (define string-ci<? (string-ci-comparator string<?))
  176. (set-string-ci-procedures! string-ci=? string-ci<?)
  177. ; Titlecase
  178. (define (char-cased? c)
  179. (or (char-lower-case? c)
  180. (char-upper-case? c)
  181. (char-title-case? c)))
  182. (define u+00ad (scalar-value->char #x00ad)) ; SOFT HYPHEN (SHY)
  183. (define u+2019 (scalar-value->char #x2019)) ; RIGHT SINGLE QUOTATION MARK
  184. (define (char-case-ignorable? c)
  185. ;; Mike suspects this list is not complete
  186. (or (char=? c #\')
  187. (char=? c u+00ad)
  188. (char=? c u+2019)
  189. (let ((cat (char-general-category c)))
  190. (or (eq? cat (general-category non-spacing-mark))
  191. (eq? cat (general-category enclosing-mark))
  192. (eq? cat (general-category formatting-character))
  193. (eq? cat (general-category modifier-symbol))))))
  194. (define (string-titlecase s)
  195. (let ((size (string-length s)))
  196. (let loop ((i 0) (r '()))
  197. ;; looking for a letter
  198. (if (>= i size)
  199. (list->string (reverse r))
  200. (let ((c (string-ref s i)))
  201. (if (char-cased? c)
  202. (let casing-loop
  203. ((j (+ 1 i))
  204. (r (if (unicode-char-specialcasing? c)
  205. (let ((specialcasing
  206. (table-ref *specialcasing-table* (char->scalar-value c))))
  207. (prepend-specialcasing-reverse
  208. (specialcasing-titlecase-start specialcasing)
  209. (specialcasing-titlecase-length specialcasing)
  210. c r))
  211. (cons (char-titlecase c) r))))
  212. (if (>= j size)
  213. (list->string (reverse r))
  214. (let ((c (string-ref s j)))
  215. (cond
  216. ((char-case-ignorable? c)
  217. (casing-loop (+ j 1) (cons c r)))
  218. ((char-cased? c)
  219. (casing-loop (+ j 1)
  220. (if (unicode-char-specialcasing? c)
  221. (prepend-specialcasing-downcase/reverse c r s j size)
  222. (cons (char-downcase c) r))))
  223. (else
  224. (loop j r))))))
  225. (loop (+ 1 i) (cons c r))))))))