encoding.scm 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. ; Copyright (c) 1993-2008 by Richard Kelsey and Jonathan Rees. See file COPYING.
  2. ; Character/string encodings
  3. ; We abstract over the primitive encode-char/decode-char characters to
  4. ; get two sets of procedures, one going through the general
  5. ; text-encoding infrastructure, and the other making use of the VM
  6. ; instructions.
  7. (define-syntax define-coding-procs
  8. (syntax-rules ()
  9. ((define-coding-procs
  10. (do-encode-char do-decode-char)
  11. char-encoding-length
  12. string-encoding-length
  13. encode-char
  14. encode-string
  15. string->bytes-n
  16. string->bytes
  17. decode-char
  18. bytes-string-size
  19. decode-string
  20. bytes->string
  21. bytes->string-n)
  22. (begin
  23. (define (char-encoding-length enc c)
  24. (call-with-values
  25. (lambda ()
  26. (do-encode-char enc c empty-buffer 0 0))
  27. (lambda (ok? count)
  28. count)))
  29. (define (string-encoding-length enc s start-index count)
  30. (let loop ((enc-length 0)
  31. (char-index 0))
  32. (if (>= char-index count)
  33. enc-length
  34. (loop (+ enc-length
  35. (char-encoding-length enc (string-ref s (+ start-index char-index))))
  36. (+ 1 char-index)))))
  37. ; returns byte count of the encoding
  38. (define (encode-char enc c target target-start)
  39. (call-with-values
  40. (lambda ()
  41. (do-encode-char enc
  42. c target target-start
  43. (- (byte-vector-length target) target-start)))
  44. (lambda (ok? count)
  45. count)))
  46. ; Will only produce complete encodings
  47. ; returns three values:
  48. ; - encoding status
  49. ; - # characters consumed
  50. ; - # bytes decoded
  51. (define (encode-string enc source source-start source-count
  52. target target-start target-count)
  53. (let loop ((source-index 0)
  54. (target-index 0))
  55. (cond
  56. ((>= source-index source-count)
  57. (values (enum encoding-status complete)
  58. source-index
  59. target-index))
  60. ((>= target-index target-count)
  61. (values (enum encoding-status insufficient)
  62. source-index
  63. target-index))
  64. (else
  65. (let ((c (string-ref source (+ source-start source-index))))
  66. (call-with-values
  67. (lambda ()
  68. (do-encode-char enc
  69. c
  70. target (+ target-start target-index)
  71. (max 0 (- target-count target-index))))
  72. (lambda (ok? count)
  73. (if (not ok?)
  74. (values (enum encoding-status insufficient)
  75. source-index
  76. target-index)
  77. (loop (+ source-index 1) (+ target-index count))))))))))
  78. (define (string->bytes-n enc s start count)
  79. (let* ((size (string-encoding-length enc s 0 count))
  80. (result (make-byte-vector size 0)))
  81. (encode-string enc s 0 count result 0 size)
  82. result))
  83. (define (string->bytes enc s)
  84. (string->bytes-n enc s 0 (string-length s)))
  85. ; Decoding
  86. ; Returns three values:
  87. ; - decoding status
  88. ; - character if status is COMPLETE, else #f
  89. ; - # bytes consumed if COMPLETE or INCOMPLETE, else #f
  90. (define (decode-char enc bytes start-index count)
  91. (call-with-values
  92. (lambda ()
  93. (do-decode-char enc
  94. bytes start-index
  95. count))
  96. (lambda (maybe-char count)
  97. (cond
  98. (maybe-char
  99. (values (enum decoding-status complete)
  100. maybe-char
  101. count))
  102. (count
  103. (values (enum decoding-status incomplete)
  104. #f
  105. count))
  106. (else
  107. (values (enum decoding-status invalid)
  108. #f
  109. #f))))))
  110. ; If STOP-AT-INVALID? is #f, we'll skip an invalid byte, and pretend
  111. ; it generated one character.
  112. ; Returns three values:
  113. ; - :DECODING-STATUS object
  114. ; - # bytes consumed
  115. ; - # characters decoded
  116. (define (bytes-string-size enc bytes start count stop-at-invalid?)
  117. (let loop ((index 0)
  118. (target-index 0))
  119. (if (>= index count)
  120. (values (enum decoding-status complete)
  121. index target-index)
  122. (call-with-values
  123. (lambda ()
  124. (do-decode-char enc
  125. bytes
  126. (+ start index)
  127. (- count index)))
  128. (lambda (char count)
  129. (cond
  130. (char
  131. (loop (+ index count) (+ target-index 1)))
  132. (count
  133. (values (enum decoding-status incomplete)
  134. index target-index))
  135. (stop-at-invalid?
  136. (values (enum decoding-status invalid)
  137. index target-index))
  138. (else
  139. (loop (+ 1 index) (+ 1 target-index)))))))))
  140. ; Returns three values:
  141. ; - :DECODING-STATUS object
  142. ; - # bytes consumed
  143. ; - # characters decoded
  144. (define (decode-string enc
  145. bytes start count
  146. target target-start target-count
  147. maybe-error-char)
  148. (let loop ((index 0)
  149. (target-index 0))
  150. (cond
  151. ((>= index count)
  152. (values (enum decoding-status complete)
  153. index
  154. target-index))
  155. ((>= target-index target-count)
  156. (values (enum decoding-status insufficient)
  157. index target-index))
  158. (else
  159. (call-with-values
  160. (lambda ()
  161. (do-decode-char enc
  162. bytes
  163. (+ start index)
  164. (- count index)))
  165. (lambda (char count)
  166. (cond
  167. (char
  168. (string-set! target (+ target-start target-index) char)
  169. (loop (+ index count) (+ target-index 1)))
  170. (count
  171. (values (enum decoding-status incomplete)
  172. index target-index))
  173. (maybe-error-char
  174. (string-set! target (+ target-start target-index) maybe-error-char)
  175. (loop (+ 1 index) (+ 1 target-index)))
  176. (else
  177. (values (enum decoding-status invalid)
  178. index target-index)))))))))
  179. ; may be slightly faster because of REVERSE-LIST->STRING
  180. ; If MAYBE-ERROR-CHAR is #f, we'll raise an error upon an invalid encoding
  181. ; If it's a character, it will be used at invalid *and incomplete* encodings
  182. (define (bytes->string enc source maybe-error-char)
  183. (bytes->string-n enc source 0 (byte-vector-length source) maybe-error-char))
  184. (define (bytes->string-n enc source start source-count maybe-error-char)
  185. (let loop ((rev-chars '())
  186. (char-count 0)
  187. (source-index 0))
  188. (if (>= source-index source-count)
  189. (reverse-list->string rev-chars char-count)
  190. (call-with-values
  191. (lambda ()
  192. (do-decode-char enc
  193. source
  194. (+ start source-index)
  195. (- source-count source-index)))
  196. (lambda (char count)
  197. (cond
  198. (char
  199. (loop (cons char rev-chars)
  200. (+ 1 char-count)
  201. (+ count source-index)))
  202. (maybe-error-char
  203. (loop (cons maybe-error-char rev-chars)
  204. (+ 1 char-count)
  205. (+ 1 source-index)))
  206. (count
  207. (decoding-error enc ; ####
  208. "incomplete encoding"
  209. source (+ start source-index)))
  210. (else
  211. (decoding-error enc ; ####
  212. "invalid encoding"
  213. source (+ start source-index)))))))))
  214. ))))
  215. (define-coding-procs (primitive-encode-char primitive-decode-char)
  216. char-encoding-length/encoding
  217. string-encoding-length/encoding
  218. encode-char/encoding
  219. encode-string/encoding
  220. string->bytes-n/encoding
  221. string->bytes/encoding
  222. decode-char/encoding
  223. bytes-string-size/encoding
  224. decode-string/encoding
  225. bytes->string/encoding
  226. bytes->string-n/encoding)
  227. (define-syntax primitive-encode-char/text-codec
  228. (syntax-rules ()
  229. ((encode-char/text-codec enc ch buffer start count)
  230. (atomically
  231. ((text-codec-encode-char-proc enc) ch buffer start count)))))
  232. (define-syntax primitive-decode-char/text-codec
  233. (syntax-rules ()
  234. ((decode-char/text-codec enc buffer start count)
  235. (atomically
  236. ((text-codec-decode-char-proc enc) buffer start count)))))
  237. (define-coding-procs (primitive-encode-char/text-codec primitive-decode-char/text-codec)
  238. char-encoding-length/text-codec
  239. string-encoding-length/text-codec
  240. encode-char/text-codec
  241. encode-string/text-codec
  242. string->bytes-n/text-codec
  243. string->bytes/text-codec
  244. decode-char/text-codec
  245. bytes-string-size/text-codec
  246. decode-string/text-codec
  247. bytes->string/text-codec
  248. bytes->string-n/text-codec)
  249. (define-syntax define-text-codec-proc
  250. (syntax-rules ()
  251. ((define-text-codec-proc (?name ?arg ...) ?name/codec ?name/encoding)
  252. (define (?name codec ?arg ...)
  253. (let ((spec (text-codec->spec codec)))
  254. (if (text-codec? spec)
  255. (?name/codec spec ?arg ...)
  256. (?name/encoding spec ?arg ...)))))))
  257. (define-text-codec-proc (char-encoding-length c)
  258. char-encoding-length/text-codec char-encoding-length/encoding)
  259. (define-text-codec-proc (string-encoding-length s start-index count)
  260. string-encoding-length/text-codec string-encoding-length/encoding)
  261. (define-text-codec-proc (encode-char c target target-start)
  262. encode-char/text-codec encode-char/encoding)
  263. (define-text-codec-proc (encode-string source source-start source-count
  264. target target-start target-count)
  265. encode-string/text-codec encode-string/encoding)
  266. (define-text-codec-proc (string->bytes-n s start count)
  267. string->bytes-n/text-codec string->bytes-n/encoding)
  268. (define-text-codec-proc (string->bytes s)
  269. string->bytes/text-codec string->bytes/encoding)
  270. (define-text-codec-proc (decode-char bytes start-index count)
  271. decode-char/text-codec decode-char/encoding)
  272. (define-text-codec-proc (bytes-string-size bytes start count stop-at-invalid?)
  273. bytes-string-size/text-codec bytes-string-size/encoding)
  274. (define-text-codec-proc (decode-string bytes start count
  275. target target-start target-count
  276. maybe-error-char)
  277. decode-string/text-codec decode-string/encoding)
  278. (define-text-codec-proc (bytes->string source maybe-error-char)
  279. bytes->string/text-codec bytes->string/encoding)
  280. (define-text-codec-proc (bytes->string-n source start source-count maybe-error-char)
  281. bytes->string-n/text-codec bytes->string-n/encoding)
  282. ;; Utilities
  283. (define empty-buffer (make-byte-vector 0 0))
  284. (define-enumeration encoding-status
  285. (complete insufficient))
  286. (define (decoding-error encoding-name
  287. message
  288. bytes start)
  289. (signal-condition
  290. (make-decoding-error encoding-name
  291. (string-append "error while decoding " encoding-name ": " message)
  292. bytes start)))
  293. (define-enumeration decoding-status
  294. (complete incomplete insufficient invalid))
  295. ;; UTF-8
  296. (define (char-encoding-length/utf-8 c)
  297. (char-encoding-length/encoding (enum text-encoding-option utf-8) c))
  298. (define (string-encoding-length/utf-8 s start-index count)
  299. (string-encoding-length/encoding (enum text-encoding-option utf-8)
  300. s start-index count))
  301. (define (encode-char/utf-8 c target target-start)
  302. (encode-char/encoding (enum text-encoding-option utf-8) c target target-start))
  303. (define (encode-string/utf-8 source source-start source-count
  304. target target-start target-count)
  305. (encode-string/encoding (enum text-encoding-option utf-8)
  306. source source-start source-count
  307. target target-start target-count))
  308. (define (string->utf-8-n s start count)
  309. (string->bytes-n/encoding (enum text-encoding-option utf-8) s start count))
  310. (define (string->utf-8 s)
  311. (string->bytes/encoding (enum text-encoding-option utf-8) s))
  312. (define (decode-char/utf-8 bytes start-index count)
  313. (decode-char/encoding (enum text-encoding-option utf-8) bytes start-index count))
  314. (define (bytes-string-size/utf-8 bytes start count stop-at-invalid?)
  315. (bytes-string-size/encoding (enum text-encoding-option utf-8)
  316. bytes start count stop-at-invalid?))
  317. (define (decode-string/utf-8 bytes start count
  318. target target-start target-count
  319. maybe-error-char)
  320. (decode-string/encoding (enum text-encoding-option utf-8)
  321. bytes start count
  322. target target-start target-count
  323. maybe-error-char))
  324. (define (utf-8->string source maybe-error-char)
  325. (bytes->string/encoding (enum text-encoding-option utf-8)
  326. source maybe-error-char))
  327. (define (utf-8->string-n source start source-count maybe-error-char)
  328. (bytes->string-n/encoding (enum text-encoding-option utf-8)
  329. source start source-count maybe-error-char))