text-encoding.scm 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. ; -*- Mode: Scheme; Syntax: Scheme; Package: Scheme; -*-
  2. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  3. (define (encode-scalar-value encoding value buffer count)
  4. (let-syntax ((encode
  5. (syntax-rules ()
  6. ((encode ?encode-proc)
  7. (call-with-values
  8. (lambda ()
  9. (?encode-proc value buffer count))
  10. (lambda (encoding-ok? out-of-space? count)
  11. (values #t encoding-ok? out-of-space? count)))))))
  12. (enum-case
  13. text-encoding-option encoding
  14. ((us-ascii) (encode encode-scalar-value/us-ascii))
  15. ((latin-1) (encode encode-scalar-value/latin-1))
  16. ((utf-8) (encode encode-scalar-value/utf-8))
  17. ((utf-16le) (encode encode-scalar-value/utf-16le))
  18. ((utf-16be) (encode encode-scalar-value/utf-16be))
  19. ((utf-32le) (encode encode-scalar-value/utf-32le))
  20. ((utf-32be) (encode encode-scalar-value/utf-32be))
  21. (else
  22. (values #f #f #f 0)))))
  23. (define (decode-scalar-value encoding buffer count)
  24. (let-syntax ((decode
  25. (syntax-rules ()
  26. ((decode ?decode-proc)
  27. (call-with-values
  28. (lambda () (?decode-proc buffer count))
  29. (lambda (ok? incomplete? value count)
  30. (values #t ok? incomplete? value count)))))))
  31. (enum-case
  32. text-encoding-option encoding
  33. ((us-ascii) (decode decode-scalar-value/us-ascii))
  34. ((latin-1) (decode decode-scalar-value/latin-1))
  35. ((utf-8) (decode decode-scalar-value/utf-8))
  36. ((utf-16le) (decode decode-scalar-value/utf-16le))
  37. ((utf-16be) (decode decode-scalar-value/utf-16be))
  38. ((utf-32le) (decode decode-scalar-value/utf-32le))
  39. ((utf-32be) (decode decode-scalar-value/utf-32be))
  40. (else
  41. (values #f #f #f 0 0)))))
  42. ;; US-ASCII
  43. ;; This is mainly needed because it might be the default locale
  44. ;; encoding reported by the OS.
  45. (define (encode-scalar-value/us-ascii value buffer count)
  46. (cond
  47. ((< count 1)
  48. (values #t #t 1))
  49. ((< value 128)
  50. (buffer-set! buffer 0 value)
  51. (values #t #f 1))
  52. (else
  53. (values #f #f 0))))
  54. (define (decode-scalar-value/us-ascii buffer count)
  55. (values #t ; OK?
  56. #f ; incomplete?
  57. (buffer-ref buffer 0)
  58. 1))
  59. ; Latin-1
  60. (define (encode-scalar-value/latin-1 value buffer count)
  61. (cond
  62. ((< count 1)
  63. (values #t #t 1))
  64. ((< value 256)
  65. (buffer-set! buffer 0 value)
  66. (values #t #f 1))
  67. (else
  68. (values #f #f 0))))
  69. (define (decode-scalar-value/latin-1 buffer count)
  70. (values #t ; OK?
  71. #f ; incomplete?
  72. (buffer-ref buffer 0)
  73. 1))
  74. ; UTF-8
  75. (define (encode-scalar-value/utf-8 value buffer count)
  76. (cond
  77. ((<= value #x7f)
  78. (if (>= count 1)
  79. (begin
  80. (buffer-set! buffer 0 value)
  81. (values #t #f 1))
  82. (values #t #t 1)))
  83. ((<= value #x7ff)
  84. (if (>= count 2)
  85. (begin
  86. (buffer-set!
  87. buffer 0
  88. (+ #xc0
  89. (logical-shift-right (bitwise-and value #b11111000000)
  90. 6)))
  91. (buffer-set!
  92. buffer 1
  93. (+ #x80
  94. (bitwise-and value #b111111)))
  95. (values #t #f 2))
  96. (values #t #t 2)))
  97. ((<= value #xffff)
  98. (if (>= count 3)
  99. (begin
  100. (buffer-set!
  101. buffer 0
  102. (+ #xe0
  103. (logical-shift-right (bitwise-and value #b1111000000000000)
  104. 12)))
  105. (buffer-set!
  106. buffer 1
  107. (+ #x80
  108. (logical-shift-right (bitwise-and value #b111111000000)
  109. 6)))
  110. (buffer-set!
  111. buffer 2
  112. (+ #x80
  113. (bitwise-and value #b111111)))
  114. (values #t #f 3))
  115. (values #t #t 3)))
  116. (else
  117. (if (>= count 4)
  118. (begin
  119. (buffer-set!
  120. buffer 0
  121. (+ #xf0
  122. (logical-shift-right (bitwise-and value #b111000000000000000000)
  123. 18)))
  124. (buffer-set!
  125. buffer 1
  126. (+ #x80
  127. (logical-shift-right (bitwise-and value #b111111000000000000)
  128. 12)))
  129. (buffer-set!
  130. buffer 2
  131. (+ #x80
  132. (logical-shift-right (bitwise-and value #b111111000000)
  133. 6)))
  134. (buffer-set!
  135. buffer 3
  136. (+ #x80
  137. (bitwise-and value #b111111)))
  138. (values #t #f 4))
  139. (values #t #t 4)))))
  140. ; The table, and the associated decoding algorithm, is from
  141. ; Richard Gillam: "Unicode Demystified", chapter 14
  142. (define *utf-8-state-table*
  143. '#(;; state 0
  144. 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 1 1 1 1 2 2 3 -1
  145. ;; state 1
  146. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 -2 -2 -2 -2 -2 -2 -2 -2
  147. ;; state 2
  148. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 1 1 1 1 1 1 1 1 -2 -2 -2 -2 -2 -2 -2 -2
  149. ;; state 3
  150. -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 2 2 2 2 2 2 2 2 -2 -2 -2 -2 -2 -2 -2 -2))
  151. (define *utf-8-masks* '#(#x7f #x1f #x0f #x07))
  152. ; We don't check for non-shortest-form UTF-8. Too bad.
  153. (define (decode-scalar-value/utf-8 buffer count)
  154. (let loop ((q 0) (state 0) (mask 0) (scalar-value 0))
  155. (if (< q count)
  156. (let* ((c (buffer-ref buffer q))
  157. (state (vector-ref *utf-8-state-table*
  158. (+ (shift-left state 5) ; (* state 32)
  159. (arithmetic-shift-right c 3)))))
  160. (case state
  161. ((0)
  162. (let ((scalar-value (+ scalar-value
  163. (bitwise-and c #x7f))))
  164. (if (scalar-value? scalar-value)
  165. (values #t #f scalar-value (+ q 1))
  166. (values #f #f 0 0))))
  167. ((1 2 3)
  168. (loop (+ 1 q) state #x3f
  169. (shift-left (+ scalar-value
  170. (bitwise-and c
  171. (if (= 0 mask)
  172. (vector-ref *utf-8-masks* state)
  173. mask)))
  174. 6)))
  175. ((-2 -1)
  176. (values #f #f 0 0))
  177. (else ; this can't happen
  178. (values #f #f 0 0))))
  179. (values #t #t 0 (+ 1 q)))))
  180. ; UTF-16
  181. (define (buffer-set-word16/le! buffer index word)
  182. (buffer-set! buffer index
  183. (bitwise-and #b11111111 word))
  184. (buffer-set! buffer (+ index 1)
  185. (logical-shift-right word 8)))
  186. (define (buffer-set-word16/be! buffer index word)
  187. (buffer-set! buffer index
  188. (logical-shift-right word 8))
  189. (buffer-set! buffer (+ index 1)
  190. (bitwise-and #b11111111 word)))
  191. (define (make-encode-scalar-value/utf-16 buffer-set-word16!)
  192. (lambda (value buffer count)
  193. (if (<= value #xffff)
  194. (if (< count 2)
  195. (values #t #t 2)
  196. (begin
  197. (buffer-set-word16! buffer 0 value)
  198. (values #t #f 2)))
  199. (if (< count 4)
  200. (values #t #t 4)
  201. (begin
  202. (buffer-set-word16!
  203. buffer 0
  204. (+ (logical-shift-right value 10) #xd7c0))
  205. (buffer-set-word16!
  206. buffer 2
  207. (+ (bitwise-and value #x3ff) #xdc00))
  208. (values #t #f 4))))))
  209. (define encode-scalar-value/utf-16le
  210. (make-encode-scalar-value/utf-16 buffer-set-word16/le!))
  211. (define encode-scalar-value/utf-16be
  212. (make-encode-scalar-value/utf-16 buffer-set-word16/be!))
  213. (define (buffer-ref-word16/le codes index)
  214. (+ (buffer-ref codes index)
  215. (shift-left (buffer-ref codes (+ index 1)) 8)))
  216. (define (buffer-ref-word16/be codes index)
  217. (+ (shift-left (buffer-ref codes index) 8)
  218. (buffer-ref codes (+ index 1))))
  219. (define (make-decode-scalar-value/utf-16 buffer-ref-word16)
  220. (lambda (buffer count)
  221. (if (< count 2)
  222. (values #t #t 0 2)
  223. (let ((word0 (buffer-ref-word16 buffer 0)))
  224. (cond
  225. ((or (< word0 #xd800)
  226. (> word0 #xdfff))
  227. (values #t #f word0 2))
  228. ((< count 4)
  229. (values #t #t 0 4))
  230. ((<= word0 #xdbff)
  231. (let ((word1 (buffer-ref-word16 buffer 2 )))
  232. (if (and (>= word1 #xdc00)
  233. (<= word1 #xdfff))
  234. (values #t #f
  235. (+ (shift-left (- word0 #xd7c0) 10)
  236. (bitwise-and word1 #x3ff))
  237. 4)
  238. (values #f #f 0 0))))
  239. (else
  240. (values #f #f 0 0)))))))
  241. (define decode-scalar-value/utf-16le
  242. (make-decode-scalar-value/utf-16 buffer-ref-word16/le))
  243. (define decode-scalar-value/utf-16be
  244. (make-decode-scalar-value/utf-16 buffer-ref-word16/be))
  245. ; UTF-32
  246. (define (encode-scalar-value/utf-32le value buffer count)
  247. (if (< count 4)
  248. (values #t #t 4)
  249. (begin
  250. (buffer-set! buffer 0
  251. (bitwise-and value #xff))
  252. (buffer-set! buffer 1
  253. (logical-shift-right
  254. (bitwise-and value #xff00)
  255. 8))
  256. (buffer-set! buffer 2
  257. (logical-shift-right
  258. (bitwise-and value #xff0000)
  259. 16))
  260. (buffer-set! buffer 3
  261. (logical-shift-right value 24))
  262. (values #t #f 4))))
  263. (define (encode-scalar-value/utf-32be value buffer count)
  264. (if (< count 4)
  265. (values #t #t 4)
  266. (begin
  267. (buffer-set! buffer 0
  268. (logical-shift-right value 24))
  269. (buffer-set! buffer 1
  270. (logical-shift-right
  271. (bitwise-and value #xff0000)
  272. 16))
  273. (buffer-set! buffer 2
  274. (logical-shift-right
  275. (bitwise-and value #xff00)
  276. 8))
  277. (buffer-set! buffer 3
  278. (bitwise-and value #xff))
  279. (values #t #f 4))))
  280. (define (decode-scalar-value/utf-32le buffer count)
  281. (if (< count 4)
  282. (values #t #t 0 4)
  283. (let ((code-point
  284. (+ (buffer-ref buffer 0)
  285. (shift-left (buffer-ref buffer 1)
  286. 8)
  287. (shift-left (buffer-ref buffer 2)
  288. 16)
  289. (shift-left (buffer-ref buffer 3)
  290. 24))))
  291. (if (scalar-value? code-point)
  292. (values #t #f
  293. code-point
  294. 4)
  295. (values #f #f 0 0)))))
  296. (define (decode-scalar-value/utf-32be buffer count)
  297. (if (< count 4)
  298. (values #t #t 0 4)
  299. (let ((code-point
  300. (+ (shift-left (buffer-ref buffer 0)
  301. 24)
  302. (shift-left (buffer-ref buffer 1)
  303. 16)
  304. (shift-left
  305. (buffer-ref buffer 2)
  306. 8)
  307. (buffer-ref buffer 3))))
  308. (if (scalar-value? code-point)
  309. (values #t #f
  310. code-point
  311. 4)
  312. (values #f #f 0 0)))))
  313. ; Utilities
  314. (define (scalar-value? x)
  315. (and (>= x 0)
  316. (or (<= x #xd7ff)
  317. (and (>= x #xe000) (<= x #x10ffff)))))
  318. (define (buffer-ref b i)
  319. (unsigned-byte-ref (address+ b i)))
  320. (define (buffer-set! b i v)
  321. (unsigned-byte-set! (address+ b i) v))