nsGBKToUnicode.cpp 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. /**
  6. * A character set converter from GBK to Unicode.
  7. *
  8. *
  9. * @created 07/Sept/1999
  10. * @author Yueheng Xu, Yueheng.Xu@intel.com
  11. */
  12. #include "nsGBKToUnicode.h"
  13. #include "gbku.h"
  14. #include "nsUnicodeDecodeHelper.h"
  15. static const uint16_t g_utGB18030Unique2Bytes[] = {
  16. #include "gb18030uniq2b.ut"
  17. };
  18. static const uint16_t g_utGB18030Unique4Bytes[] = {
  19. #include "gb180304bytes.ut"
  20. };
  21. //----------------------------------------------------------------------
  22. // Class nsGB18030ToUnicode [implementation]
  23. //----------------------------------------------------------------------
  24. // Subclassing of nsBufferDecoderSupport class [implementation]
  // Byte-range predicates used while scanning GBK / GB18030 input.
  // Every predicate is built on UINT8_IN_RANGE(low, value, high).

  // First byte of any multi-byte sequence: 0x81..0xFE.
  #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

  // First byte of a four-byte sequence that is decoded to a surrogate pair
  // rather than through the four-byte table: 0x90..0xFE.
  #define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

  // Second byte of a two-byte sequence: 0x40..0x7E or 0x80..0xFE.
  #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
    (UINT8_IN_RANGE(0x40, (c), 0x7E) || UINT8_IN_RANGE(0x80, (c), 0xFE))

  // Second byte of a four-byte sequence: 0x30..0x39.
  #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))

  // Third byte of a four-byte sequence: 0x81..0xFE.
  #define LEGAL_GBK_4BYTE_THIRD_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

  // Fourth byte of a four-byte sequence: 0x30..0x39, the same range as the
  // second byte. (The historical spelling "FORTH" is kept because the name is
  // referenced by the decoder code.)
  #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) LEGAL_GBK_4BYTE_SECOND_BYTE(c)
  37. NS_IMETHODIMP nsGB18030ToUnicode::ConvertNoBuff(const char* aSrc,
  38. int32_t * aSrcLength,
  39. char16_t *aDest,
  40. int32_t * aDestLength)
  41. {
  42. int32_t i=0;
  43. int32_t iSrcLength = (*aSrcLength);
  44. int32_t iDestlen = 0;
  45. nsresult rv=NS_OK;
  46. *aSrcLength = 0;
  47. for (i=0;i<iSrcLength;i++)
  48. {
  49. if ( iDestlen >= (*aDestLength) )
  50. {
  51. rv = NS_OK_UDEC_MOREOUTPUT;
  52. break;
  53. }
  54. // The valid range for the 1st byte is [0x81,0xFE]
  55. if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
  56. {
  57. if(i+1 >= iSrcLength)
  58. {
  59. rv = NS_OK_UDEC_MOREINPUT;
  60. break;
  61. }
  62. // To make sure, the second byte has to be checked as well.
  63. // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
  64. if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
  65. {
  66. // Valid GBK code
  67. *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
  68. if(UCS2_NO_MAPPING == *aDest)
  69. {
  70. // We cannot map in the common mapping, let's call the
  71. // delegate 2 byte decoder to decode the gbk or gb18030 unique
  72. // 2 byte mapping
  73. if(! TryExtensionDecoder(aSrc, aDest))
  74. {
  75. *aDest = UCS2_NO_MAPPING;
  76. }
  77. }
  78. aSrc += 2;
  79. i++;
  80. }
  81. else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
  82. {
  83. // from the first 2 bytes, it looks like a 4 byte GB18030
  84. if(i+3 >= iSrcLength) // make sure we got 4 bytes
  85. {
  86. rv = NS_OK_UDEC_MOREINPUT;
  87. break;
  88. }
  89. // 4 bytes patten
  90. // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
  91. // preset the
  92. if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
  93. LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
  94. {
  95. if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
  96. {
  97. // let's call the delegated 4 byte gb18030 converter to convert it
  98. if (!Try4BytesDecoder(aSrc, aDest)) {
  99. *aDest = UCS2_NO_MAPPING;
  100. }
  101. // Swapped character in GB18030-2005
  102. if (*aDest == 0x1E3F) {
  103. *aDest = 0xE7C7;
  104. }
  105. } else {
  106. // let's try supplement mapping
  107. if ( (iDestlen+1) < (*aDestLength) )
  108. {
  109. if(DecodeToSurrogate(aSrc, aDest))
  110. {
  111. // surrogte two char16_t
  112. iDestlen++;
  113. aDest++;
  114. } else {
  115. *aDest = UCS2_NO_MAPPING;
  116. }
  117. } else {
  118. if (*aDestLength < 2) {
  119. NS_ERROR("insufficient space in output buffer");
  120. *aDest = UCS2_NO_MAPPING;
  121. } else {
  122. rv = NS_OK_UDEC_MOREOUTPUT;
  123. break;
  124. }
  125. }
  126. }
  127. aSrc += 4;
  128. i += 3;
  129. } else {
  130. *aDest = UCS2_NO_MAPPING;
  131. // If the third and fourth bytes are not in the legal ranges for
  132. // a four-byte sequnce, resynchronize on the second byte
  133. // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
  134. // 0x30-0x39)
  135. aSrc++;
  136. }
  137. }
  138. else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
  139. {
  140. // stand-alone (not followed by a valid second byte) 0xA0 !
  141. // treat it as valid a la Netscape 4.x
  142. *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
  143. aSrc++;
  144. } else {
  145. // Invalid GBK code point (second byte should be 0x40 or higher)
  146. *aDest = UCS2_NO_MAPPING;
  147. aSrc++;
  148. }
  149. } else {
  150. if(IS_ASCII(*aSrc))
  151. {
  152. // The source is an ASCII
  153. *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
  154. aSrc++;
  155. } else {
  156. if(IS_GBK_EURO(*aSrc)) {
  157. *aDest = UCS2_EURO;
  158. } else {
  159. *aDest = UCS2_NO_MAPPING;
  160. }
  161. aSrc++;
  162. }
  163. }
  164. iDestlen++;
  165. aDest++;
  166. *aSrcLength = i+1;
  167. }
  168. *aDestLength = iDestlen;
  169. return rv;
  170. }
  171. bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
  172. {
  173. NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte");
  174. NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte");
  175. NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte");
  176. NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte");
  177. if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
  178. return false;
  179. if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
  180. return false;
  181. if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
  182. return false;
  183. if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
  184. return false;
  185. uint8_t a1 = (uint8_t) aSrc[0];
  186. uint8_t a2 = (uint8_t) aSrc[1];
  187. uint8_t a3 = (uint8_t) aSrc[2];
  188. uint8_t a4 = (uint8_t) aSrc[3];
  189. a1 -= (uint8_t)0x90;
  190. a2 -= (uint8_t)0x30;
  191. a3 -= (uint8_t)0x81;
  192. a4 -= (uint8_t)0x30;
  193. uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
  194. // idx == ucs4Codepoint - 0x10000
  195. if (idx > 0x000FFFFF)
  196. return false;
  197. *aOut++ = 0xD800 | (idx >> 10);
  198. *aOut = 0xDC00 | (0x000003FF & idx);
  199. return true;
  200. }
  201. bool nsGB18030ToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
  202. {
  203. int32_t len = 2;
  204. int32_t dstlen = 1;
  205. nsresult res =
  206. nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
  207. u2BytesCharset, nullptr,
  208. (uMappingTable*) &g_utGB18030Unique2Bytes,
  209. false);
  210. NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
  211. "some strange conversion result");
  212. // if we failed, we then just use the 0xfffd
  213. // therefore, we ignore the res here.
  214. return NS_SUCCEEDED(res);
  215. }
  216. bool nsGB18030ToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
  217. {
  218. int32_t len = 4;
  219. int32_t dstlen = 1;
  220. nsresult res =
  221. nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
  222. u4BytesGB18030Charset, nullptr,
  223. (uMappingTable*) &g_utGB18030Unique4Bytes,
  224. false);
  225. NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
  226. "some strange conversion result");
  227. // if we failed, we then just use the 0xfffd
  228. // therefore, we ignore the res here.
  229. return NS_SUCCEEDED(res);
  230. }