// utf8.js (6.4 KB)
  1. import { inRange, decoderError, isASCIICodePoint,
  2. end_of_stream, finished } from './text_decoder_utils.js'
  /**
   * UTF-8 decoder implemented as a byte-at-a-time state machine.
   *
   * All decoding state lives in variables local to the constructor and is
   * captured by `handler`, so every instance tracks exactly one byte stream.
   *
   * @implements {Decoder}
   */
  export class UTF8Decoder {
    /**
     * @param {{fatal: boolean}} options `fatal` is passed unchanged to
     *   decoderError() every time malformed input is detected.
     */
    constructor(options) {
      const { fatal } = options

      // The utf-8 decoder has an associated utf-8 code point, utf-8 bytes
      // seen, and utf-8 bytes needed (all initially 0), a utf-8 lower
      // boundary (initially 0x80), and a utf-8 upper boundary (initially
      // 0xBF).
      /** @type {number} */
      let utf8_code_point = 0
      /** @type {number} */
      let utf8_bytes_seen = 0
      /** @type {number} */
      let utf8_bytes_needed = 0
      /** @type {number} */
      let utf8_lower_boundary = 0x80
      /** @type {number} */
      let utf8_upper_boundary = 0xBF

      // Puts every piece of decoder state back to its initial value. Used
      // after a completed code point and after every mid-sequence error so
      // that no partial sequence can leak into the next one.
      const resetState = () => {
        utf8_code_point = 0
        utf8_bytes_seen = 0
        utf8_bytes_needed = 0
        utf8_lower_boundary = 0x80
        utf8_upper_boundary = 0xBF
      }

      /**
       * @param {Stream} stream The stream of bytes being decoded. Only its
       *     prepend() method is used, to push back a byte that terminated
       *     a malformed sequence so that it is decoded again on its own.
       * @param {number} bite The next byte read from the stream, or
       *     end_of_stream.
       * @return {?(number|!Array.<number>)} The decoded code point, null
       *     when more bytes are needed to complete the current code point,
       *     `finished` at a clean end of stream, or whatever
       *     decoderError(fatal) produces for malformed input (presumably it
       *     throws when fatal and yields a replacement code point
       *     otherwise - confirm in text_decoder_utils.js).
       */
      this.handler = function(stream, bite) {
        // 1. + 2. End of stream: finished if no sequence is pending,
        // otherwise the input was truncated. The whole state is reset (not
        // just utf-8 bytes needed) so that a stale bytes-seen counter or a
        // narrowed boundary cannot corrupt input handled afterwards.
        if (bite === end_of_stream) {
          if (utf8_bytes_needed === 0) {
            return finished
          }
          resetState()
          return decoderError(fatal)
        }

        // 3. If utf-8 bytes needed is 0, this byte starts a new sequence.
        if (utf8_bytes_needed === 0) {
          // 0x00 to 0x7F: the byte is the code point.
          if (inRange(bite, 0x00, 0x7F)) {
            return bite
          }

          if (inRange(bite, 0xC2, 0xDF)) {
            // Lead byte of a two-byte sequence (0xC0/0xC1 would be overlong
            // and fall through to the error branch).
            utf8_bytes_needed = 1
            utf8_code_point = bite & 0x1F
          } else if (inRange(bite, 0xE0, 0xEF)) {
            // Lead byte of a three-byte sequence.
            if (bite === 0xE0) {
              // Second byte below 0xA0 would be an overlong encoding.
              utf8_lower_boundary = 0xA0
            }
            if (bite === 0xED) {
              // Second byte above 0x9F would encode a surrogate.
              utf8_upper_boundary = 0x9F
            }
            utf8_bytes_needed = 2
            utf8_code_point = bite & 0xF
          } else if (inRange(bite, 0xF0, 0xF4)) {
            // Lead byte of a four-byte sequence.
            if (bite === 0xF0) {
              // Second byte below 0x90 would be an overlong encoding.
              utf8_lower_boundary = 0x90
            }
            if (bite === 0xF4) {
              // Second byte above 0x8F would exceed U+10FFFF.
              utf8_upper_boundary = 0x8F
            }
            utf8_bytes_needed = 3
            utf8_code_point = bite & 0x7
          } else {
            // Stray continuation byte or invalid lead byte.
            return decoderError(fatal)
          }

          // Continue: the code point is not complete yet.
          return null
        }

        // 4. A continuation byte outside the allowed range aborts the
        // current sequence. The offending byte is pushed back because it
        // may itself be the start of a valid sequence.
        if (!inRange(bite, utf8_lower_boundary, utf8_upper_boundary)) {
          resetState()
          stream.prepend(bite)
          return decoderError(fatal)
        }

        // 5. Only the first continuation byte has a narrowed range.
        utf8_lower_boundary = 0x80
        utf8_upper_boundary = 0xBF

        // 6. Shift in the six payload bits of this continuation byte.
        utf8_code_point = (utf8_code_point << 6) | (bite & 0x3F)

        // 7. + 8. Count the byte; keep going until the sequence is complete.
        utf8_bytes_seen += 1
        if (utf8_bytes_seen !== utf8_bytes_needed) {
          return null
        }

        // 9. - 11. Emit the finished code point and start over.
        const code_point = utf8_code_point
        resetState()
        return code_point
      }
    }
  }
  // 9.1.2 utf-8 encoder
  /**
   * UTF-8 encoder: turns one code point at a time into its UTF-8 bytes.
   *
   * @implements {Encoder}
   */
  export class UTF8Encoder {
    constructor() {
      /**
       * @param {Stream} stream Input stream (not used by this encoder).
       * @param {number} code_point Next code point read from the stream,
       *     or end_of_stream.
       * @return {(number|!Array.<number>)} `finished` at end of stream, the
       *     code point itself when isASCIICodePoint() accepts it, otherwise
       *     an array of two to four bytes.
       * @throws {RangeError} If code_point is not ASCII and lies outside
       *     U+0080..U+10FFFF, so no UTF-8 sequence length exists for it.
       */
      this.handler = function(stream, code_point) {
        // 1. If code point is end-of-stream, return finished.
        if (code_point === end_of_stream) {
          return finished
        }

        // 2. If code point is an ASCII code point, return a byte whose
        // value is code point.
        if (isASCIICodePoint(code_point)) {
          return code_point
        }

        // 3. Set count (number of continuation bytes) and offset (marker
        // bits of the lead byte) based on the range code point is in.
        let count
        let offset
        if (inRange(code_point, 0x0080, 0x07FF)) {
          count = 1
          offset = 0xC0
        } else if (inRange(code_point, 0x0800, 0xFFFF)) {
          // NOTE(review): surrogates (U+D800..U+DFFF) are encoded as-is
          // here; presumably callers only pass scalar values - confirm.
          count = 2
          offset = 0xE0
        } else if (inRange(code_point, 0x10000, 0x10FFFF)) {
          count = 3
          offset = 0xF0
        } else {
          // Without this guard count/offset stay undefined and the
          // arithmetic below silently produces [NaN].
          throw new RangeError('Invalid code point for UTF-8: ' + code_point)
        }

        // 4. Let bytes be a byte sequence whose first byte is (code
        // point >> (6 * count)) + offset.
        const bytes = [(code_point >> (6 * count)) + offset]

        // 5. Emit the continuation bytes, most significant six bits first.
        while (count > 0) {
          const temp = code_point >> (6 * (count - 1))
          bytes.push(0x80 | (temp & 0x3F))
          count -= 1
        }

        // 6. Return bytes, in order.
        return bytes
      }
    }
  }