utf16.js 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. import { inRange, decoderError, end_of_stream, finished, convertCodeUnitToBytes } from './text_decoder_utils.js'
  2. // 15.2.1 shared utf-16 decoder
  3. /**
  4. * @implements {Decoder}
  5. */
  6. export class UTF16Decoder {
  7. /**
  8. * @param {boolean} utf16_be True if big-endian, false if little-endian.
  9. * @param {{fatal: boolean}} options
  10. */
  11. constructor(utf16_be, options) {
  12. const { fatal } = options
  13. this.utf16_be = utf16_be
  14. this.fatal = fatal
  15. this.utf16_lead_byte = null
  16. this.utf16_lead_surrogate = null
  17. }
  18. /**
  19. * @param {Stream} stream The stream of bytes being decoded.
  20. * @param {number} bite The next byte read from the stream.
  21. */
  22. handler(stream, bite) {
  23. // 1. If byte is end-of-stream and either utf-16 lead byte or
  24. // utf-16 lead surrogate is not null, set utf-16 lead byte and
  25. // utf-16 lead surrogate to null, and return error.
  26. if (bite === end_of_stream && (this.utf16_lead_byte !== null ||
  27. this.utf16_lead_surrogate !== null)) {
  28. return decoderError(this.fatal)
  29. }
  30. // 2. If byte is end-of-stream and utf-16 lead byte and utf-16
  31. // lead surrogate are null, return finished.
  32. if (bite === end_of_stream && this.utf16_lead_byte === null &&
  33. this.utf16_lead_surrogate === null) {
  34. return finished
  35. }
  36. // 3. If utf-16 lead byte is null, set utf-16 lead byte to byte
  37. // and return continue.
  38. if (this.utf16_lead_byte === null) {
  39. this.utf16_lead_byte = bite
  40. return null
  41. }
  42. // 4. Let code unit be the result of:
  43. let code_unit
  44. if (this.utf16_be) {
  45. // utf-16be decoder flag is set
  46. // (utf-16 lead byte << 8) + byte.
  47. code_unit = (this.utf16_lead_byte << 8) + bite
  48. } else {
  49. // utf-16be decoder flag is unset
  50. // (byte << 8) + utf-16 lead byte.
  51. code_unit = (bite << 8) + this.utf16_lead_byte
  52. }
  53. // Then set utf-16 lead byte to null.
  54. this.utf16_lead_byte = null
  55. // 5. If utf-16 lead surrogate is not null, let lead surrogate
  56. // be utf-16 lead surrogate, set utf-16 lead surrogate to null,
  57. // and then run these substeps:
  58. if (this.utf16_lead_surrogate !== null) {
  59. const lead_surrogate = this.utf16_lead_surrogate
  60. this.utf16_lead_surrogate = null
  61. // 1. If code unit is in the range U+DC00 to U+DFFF,
  62. // inclusive, return a code point whose value is 0x10000 +
  63. // ((lead surrogate − 0xD800) << 10) + (code unit − 0xDC00).
  64. if (inRange(code_unit, 0xDC00, 0xDFFF)) {
  65. return 0x10000 + (lead_surrogate - 0xD800) * 0x400 +
  66. (code_unit - 0xDC00)
  67. }
  68. // 2. Prepend the sequence resulting of converting code unit
  69. // to bytes using utf-16be decoder flag to stream and return
  70. // error.
  71. stream.prepend(convertCodeUnitToBytes(code_unit, this.utf16_be))
  72. return decoderError(this.fatal)
  73. }
  74. // 6. If code unit is in the range U+D800 to U+DBFF, inclusive,
  75. // set utf-16 lead surrogate to code unit and return continue.
  76. if (inRange(code_unit, 0xD800, 0xDBFF)) {
  77. this.utf16_lead_surrogate = code_unit
  78. return null
  79. }
  80. // 7. If code unit is in the range U+DC00 to U+DFFF, inclusive,
  81. // return error.
  82. if (inRange(code_unit, 0xDC00, 0xDFFF))
  83. return decoderError(this.fatal)
  84. // 8. Return code point code unit.
  85. return code_unit
  86. }
  87. }
  88. // 15.2.2 shared utf-16 encoder
  89. /**
  90. * @implements {Encoder}
  91. */
  92. export class UTF16Encoder {
  93. /**
  94. * @param {boolean} [utf16_be] True if big-endian, false if little-endian.
  95. */
  96. constructor(utf16_be = false) {
  97. this.utf16_be = utf16_be
  98. }
  99. /**
  100. * @param {Stream} stream Input stream.
  101. * @param {number} code_point Next code point read from the stream.
  102. */
  103. handler(stream, code_point) {
  104. // 1. If code point is end-of-stream, return finished.
  105. if (code_point === end_of_stream)
  106. return finished
  107. // 2. If code point is in the range U+0000 to U+FFFF, inclusive,
  108. // return the sequence resulting of converting code point to
  109. // bytes using utf-16be encoder flag.
  110. if (inRange(code_point, 0x0000, 0xFFFF))
  111. return convertCodeUnitToBytes(code_point, this.utf16_be)
  112. // 3. Let lead be ((code point − 0x10000) >> 10) + 0xD800,
  113. // converted to bytes using utf-16be encoder flag.
  114. const lead = convertCodeUnitToBytes(
  115. ((code_point - 0x10000) >> 10) + 0xD800, this.utf16_be)
  116. // 4. Let trail be ((code point − 0x10000) & 0x3FF) + 0xDC00,
  117. // converted to bytes using utf-16be encoder flag.
  118. const trail = convertCodeUnitToBytes(
  119. ((code_point - 0x10000) & 0x3FF) + 0xDC00, this.utf16_be)
  120. // 5. Return a byte sequence of lead followed by trail.
  121. return lead.concat(trail)
  122. }
  123. }