123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- import { inRange, decoderError, isASCIICodePoint,
- end_of_stream, finished } from './text_decoder_utils.js'
/**
 * 9.1.1 utf-8 decoder
 *
 * Streaming UTF-8 decoder. The state of the sequence currently being
 * decoded lives in closure variables of the constructor, so `handler`
 * can be fed one byte per call and picks up where the last call stopped.
 *
 * @implements {Decoder}
 */
export class UTF8Decoder {
  /**
   * @param {{fatal: boolean}} options Decoder options. `fatal` is passed
   *     unchanged to decoderError() whenever malformed input is detected.
   */
  constructor(options) {
    const { fatal } = options

    // utf-8's decoder has an associated utf-8 code point, utf-8 bytes
    // seen, and utf-8 bytes needed (all initially 0), a utf-8 lower
    // boundary (initially 0x80), and a utf-8 upper boundary (initially
    // 0xBF).
    /** @type {number} */
    let utf8_code_point = 0
    /** @type {number} */
    let utf8_bytes_seen = 0
    /** @type {number} */
    let utf8_bytes_needed = 0
    /** @type {number} */
    let utf8_lower_boundary = 0x80
    /** @type {number} */
    let utf8_upper_boundary = 0xBF

    // Puts every piece of per-sequence state back to its initial value.
    const resetState = () => {
      utf8_code_point = 0
      utf8_bytes_seen = 0
      utf8_bytes_needed = 0
      utf8_lower_boundary = 0x80
      utf8_upper_boundary = 0xBF
    }

    /**
     * @param {Stream} stream The stream of bytes being decoded. Only its
     *     prepend() method is used, to push back a byte that terminated
     *     a malformed sequence so that it is decoded again on its own.
     * @param {number} bite The next byte read from the stream, or
     *     end_of_stream.
     * @return {?(number|!Array.<number>)} The next code point decoded,
     *     `finished` at a clean end of stream, the result of
     *     decoderError(fatal) for malformed input, or null if not enough
     *     data exists in the input stream to decode a complete code
     *     point.
     */
    this.handler = function(stream, bite) {
      if (bite === end_of_stream) {
        // 2. If byte is end-of-stream (and nothing is pending), return
        // finished.
        if (utf8_bytes_needed === 0)
          return finished
        // 1. If byte is end-of-stream and utf-8 bytes needed is not 0,
        // set utf-8 bytes needed to 0 and return error.
        // The whole state is reset, not just bytes needed: otherwise
        // bytes seen and a narrowed boundary would survive the truncated
        // sequence and corrupt the next one if this handler is fed more
        // bytes afterwards.
        resetState()
        return decoderError(fatal)
      }

      // 3. If utf-8 bytes needed is 0, based on byte:
      if (utf8_bytes_needed === 0) {
        // 0x00 to 0x7F
        if (inRange(bite, 0x00, 0x7F)) {
          // Return a code point whose value is byte.
          return bite
        }

        // 0xC2 to 0xDF
        if (inRange(bite, 0xC2, 0xDF)) {
          // Lead byte of a two-byte sequence; keep its low 5 bits.
          utf8_bytes_needed = 1
          utf8_code_point = bite & 0x1F
        }
        // 0xE0 to 0xEF
        else if (inRange(bite, 0xE0, 0xEF)) {
          // 0xE0 narrows the lower boundary to 0xA0 and 0xED narrows
          // the upper boundary to 0x9F for the next byte only.
          if (bite === 0xE0)
            utf8_lower_boundary = 0xA0
          if (bite === 0xED)
            utf8_upper_boundary = 0x9F
          // Lead byte of a three-byte sequence; keep its low 4 bits.
          utf8_bytes_needed = 2
          utf8_code_point = bite & 0xF
        }
        // 0xF0 to 0xF4
        else if (inRange(bite, 0xF0, 0xF4)) {
          // 0xF0 narrows the lower boundary to 0x90 and 0xF4 narrows
          // the upper boundary to 0x8F for the next byte only.
          if (bite === 0xF0)
            utf8_lower_boundary = 0x90
          if (bite === 0xF4)
            utf8_upper_boundary = 0x8F
          // Lead byte of a four-byte sequence; keep its low 3 bits.
          utf8_bytes_needed = 3
          utf8_code_point = bite & 0x7
        }
        // Otherwise
        else {
          // Return error.
          return decoderError(fatal)
        }

        // Return continue.
        return null
      }

      // 4. If byte is not in the range utf-8 lower boundary to utf-8
      // upper boundary, inclusive, run these substeps:
      if (!inRange(bite, utf8_lower_boundary, utf8_upper_boundary)) {
        // 1. Set utf-8 code point, utf-8 bytes needed, and utf-8
        // bytes seen to 0, set utf-8 lower boundary to 0x80, and set
        // utf-8 upper boundary to 0xBF.
        resetState()
        // 2. Prepend byte to stream.
        stream.prepend(bite)
        // 3. Return error.
        return decoderError(fatal)
      }

      // 5. Set utf-8 lower boundary to 0x80 and utf-8 upper boundary
      // to 0xBF.
      utf8_lower_boundary = 0x80
      utf8_upper_boundary = 0xBF

      // 6. Set UTF-8 code point to (UTF-8 code point << 6) | (byte &
      // 0x3F)
      utf8_code_point = (utf8_code_point << 6) | (bite & 0x3F)

      // 7. Increase utf-8 bytes seen by one.
      utf8_bytes_seen += 1

      // 8. If utf-8 bytes seen is not equal to utf-8 bytes needed,
      // continue.
      if (utf8_bytes_seen !== utf8_bytes_needed)
        return null

      // 9. Let code point be utf-8 code point.
      const code_point = utf8_code_point

      // 10. Set utf-8 code point, utf-8 bytes needed, and utf-8 bytes
      // seen to 0.
      utf8_code_point = utf8_bytes_needed = utf8_bytes_seen = 0

      // 11. Return a code point whose value is code point.
      return code_point
    }
  }
}
// 9.1.2 utf-8 encoder
/**
 * UTF-8 encoder. Holds no state between calls: each call to `handler`
 * turns one code point into the byte or bytes that encode it.
 *
 * @implements {Encoder}
 */
export class UTF8Encoder {
  constructor() {
    /**
     * @param {Stream} stream Input stream (not used by this encoder).
     * @param {number} code_point Next code point read from the stream,
     *     or end_of_stream.
     * @return {(number|!Array.<number>)} Byte(s) to emit: the code point
     *     itself for ASCII, an array of 2 to 4 bytes otherwise, or
     *     `finished` at end of stream.
     * @throws {RangeError} If code_point is neither end_of_stream nor
     *     within U+0000 to U+10FFFF.
     */
    this.handler = function(stream, code_point) {
      // 1. If code point is end-of-stream, return finished.
      if (code_point === end_of_stream)
        return finished

      // 2. If code point is an ASCII code point, return a byte whose
      // value is code point.
      if (isASCIICodePoint(code_point))
        return code_point

      // 3. Set count and offset based on the range code point is in:
      let count
      let offset
      // U+0080 to U+07FF, inclusive:
      if (inRange(code_point, 0x0080, 0x07FF)) {
        // 1 and 0xC0
        count = 1
        offset = 0xC0
      }
      // U+0800 to U+FFFF, inclusive:
      else if (inRange(code_point, 0x0800, 0xFFFF)) {
        // 2 and 0xE0
        count = 2
        offset = 0xE0
      }
      // U+10000 to U+10FFFF, inclusive:
      else if (inRange(code_point, 0x10000, 0x10FFFF)) {
        // 3 and 0xF0
        count = 3
        offset = 0xF0
      }
      // Anything else is not a code point. Fail loudly: with count and
      // offset left undefined the arithmetic below would yield [NaN].
      else {
        throw new RangeError(`Invalid code point: ${code_point}`)
      }

      // 4. Let bytes be a byte sequence whose first byte is (code
      // point >> (6 × count)) + offset.
      const bytes = [(code_point >> (6 * count)) + offset]

      // 5. Run these substeps while count is greater than 0:
      while (count > 0) {
        // 1. Set temp to code point >> (6 × (count − 1)).
        const temp = code_point >> (6 * (count - 1))
        // 2. Append to bytes 0x80 | (temp & 0x3F).
        bytes.push(0x80 | (temp & 0x3F))
        // 3. Decrease count by one.
        count -= 1
      }

      // 6. Return bytes bytes, in order.
      return bytes
    }
  }
}
|