nsUnicodeToGBK.cpp 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. /**
  6. * A character set converter from Unicode to GBK.
  7. *
  8. *
  9. * @created 08/Sept/1999
  10. * @author Yueheng Xu, Yueheng.Xu@intel.com
  11. * Revision History
  12. * 04/Oct/1999. Yueheng Xu: used table gUnicodeToGBKTable[0x5200] to make
  13. * Unicode to GB mapping fast
  14. */
  15. #include "nsUnicodeToGBK.h"
  16. #include "gbku.h"
  17. #include "uconvutil.h"
  18. #include "nsCharTraits.h"
  19. #include "nsUnicodeEncodeHelper.h"
  20. //-------------------------------------------------------------
  21. // Global table initialization function defined in gbku.h
  22. //-------------------------------------------------------------
  23. static const uint16_t g_uf_gb18030_2bytes[] = {
  24. #include "gb18030uniq2b.uf"
  25. };
  26. static const uint16_t g_uf_gb18030_4bytes[] = {
  27. #include "gb180304bytes.uf"
  28. };
  29. static const uint16_t g_uf_gbk[] = {
  30. #include "gbkuniq.uf"
  31. };
  32. //-----------------------------------------------------------------------
  33. // nsUnicodeToGB18030
  34. //-----------------------------------------------------------------------
  35. nsresult nsUnicodeToGB18030::TryExtensionEncoder(char16_t aChar,
  36. char* aOut,
  37. int32_t *aOutLen)
  38. {
  39. int32_t len = 1;
  40. return nsUnicodeEncodeHelper::ConvertByTable(
  41. &aChar, &len, aOut, aOutLen, u2BytesCharset, nullptr,
  42. (uMappingTable*) &g_uf_gb18030_2bytes);
  43. }
  44. nsresult nsUnicodeToGB18030::Try4BytesEncoder(char16_t aChar,
  45. char* aOut,
  46. int32_t *aOutLen)
  47. {
  48. int32_t len = 1;
  49. nsresult res = nsUnicodeEncodeHelper::ConvertByTable(
  50. &aChar, &len, aOut, aOutLen, u4BytesGB18030Charset, nullptr,
  51. (uMappingTable*) &g_uf_gb18030_4bytes);
  52. MOZ_ASSERT((res != NS_OK) || ((1 == len) && (4 == *aOutLen)),
  53. "unexpect conversion length");
  54. return res;
  55. }
  56. nsresult nsUnicodeToGB18030::EncodeSurrogate(char16_t aSurrogateHigh,
  57. char16_t aSurrogateLow,
  58. char* aOut,
  59. int32_t aDestLength,
  60. int32_t aBufferLength)
  61. {
  62. if( NS_IS_HIGH_SURROGATE(aSurrogateHigh) &&
  63. NS_IS_LOW_SURROGATE(aSurrogateLow) )
  64. {
  65. // notice that idx does not include the 0x10000
  66. uint32_t idx = ((aSurrogateHigh - (char16_t)0xD800) << 10 ) |
  67. (aSurrogateLow - (char16_t) 0xDC00);
  68. if (aDestLength + 4 > aBufferLength) {
  69. return NS_OK_UENC_MOREOUTPUT;
  70. }
  71. unsigned char *out = (unsigned char*) aOut;
  72. // notice this is from 0x90 for supplementary planes
  73. out[0] = (idx / (10*126*10)) + 0x90;
  74. idx %= (10*126*10);
  75. out[1] = (idx / (10*126)) + 0x30;
  76. idx %= (10*126);
  77. out[2] = (idx / (10)) + 0x81;
  78. out[3] = (idx % 10) + 0x30;
  79. return NS_OK;
  80. }
  81. return NS_ERROR_UENC_NOMAPPING;
  82. }
  83. //----------------------------------------------------------------------
  84. // Class nsUnicodeToGBK [implementation]
  85. nsUnicodeToGBK::nsUnicodeToGBK(uint32_t aMaxLength) :
  86. nsEncoderSupport(aMaxLength), mSurrogateHigh(0)
  87. {
  88. }
  89. nsresult nsUnicodeToGBK::TryExtensionEncoder(char16_t aChar,
  90. char* aOut,
  91. int32_t *aOutLen)
  92. {
  93. int32_t len = 1;
  94. return nsUnicodeEncodeHelper::ConvertByTable(
  95. &aChar, &len, aOut, aOutLen, u1ByteCharset, nullptr,
  96. (uMappingTable*) &g_uf_gbk);
  97. }
  98. nsresult nsUnicodeToGBK::Try4BytesEncoder(char16_t aChar,
  99. char* aOut,
  100. int32_t *aOutLen)
  101. {
  102. return NS_ERROR_UENC_NOMAPPING;
  103. }
  104. nsresult nsUnicodeToGBK::EncodeSurrogate(char16_t aSurrogateHigh,
  105. char16_t aSurrogateLow,
  106. char* aOut,
  107. int32_t aDestLength,
  108. int32_t aBufferLength)
  109. {
  110. return NS_ERROR_UENC_NOMAPPING; // GBK cannot encode Surrogate, let the subclass encode it.
  111. }
  112. NS_IMETHODIMP nsUnicodeToGBK::ConvertNoBuffNoErr(const char16_t * aSrc,
  113. int32_t * aSrcLength,
  114. char * aDest,
  115. int32_t * aDestLength)
  116. {
  117. int32_t iSrcLength = 0;
  118. int32_t iDestLength = 0;
  119. char16_t unicode;
  120. nsresult res = NS_OK;
  121. while (iSrcLength < *aSrcLength )
  122. {
  123. unicode = *aSrc;
  124. //if unicode's hi byte has something, it is not ASCII, must be a GB
  125. if (IS_ASCII(unicode)) {
  126. // make sure we still have 1 byte for output first
  127. if (iDestLength >= *aDestLength) {
  128. res = NS_OK_UENC_MOREOUTPUT;
  129. break;
  130. }
  131. // this is an ASCII
  132. *aDest = CAST_UNICHAR_TO_CHAR(*aSrc);
  133. aDest++; // increment 1 byte
  134. iDestLength +=1;
  135. } else {
  136. char byte1, byte2;
  137. if(mUtil.UnicodeToGBKChar( unicode, false, &byte1, &byte2))
  138. {
  139. // make sure we still have 2 bytes for output first
  140. if(iDestLength+2 > *aDestLength)
  141. {
  142. res = NS_OK_UENC_MOREOUTPUT;
  143. break;
  144. }
  145. aDest[0] = byte1;
  146. aDest[1] = byte2;
  147. aDest += 2; // increment 2 bytes
  148. iDestLength +=2;
  149. } else {
  150. // Swapped character in GB18030-2005
  151. if (unicode == 0xE7C7) {
  152. unicode = 0x1E3F;
  153. }
  154. // we cannot map in the common mapping. Let's try to
  155. // call the delegated 2 byte converter for the gbk or gb18030
  156. // unique 2 byte mapping
  157. int32_t outLen = *aDestLength - iDestLength;
  158. if (NS_IS_HIGH_SURROGATE(unicode) ||
  159. NS_IS_LOW_SURROGATE(unicode)) {
  160. // performance tune for surrogate characters
  161. res = NS_ERROR_UENC_NOMAPPING;
  162. } else {
  163. res = TryExtensionEncoder(unicode, aDest, &outLen);
  164. }
  165. if (res == NS_OK) {
  166. iDestLength += outLen;
  167. aDest += outLen;
  168. } else if (res == NS_OK_UENC_MOREOUTPUT) {
  169. break;
  170. } else {
  171. // we still cannot map. Let's try to
  172. // call the delegated GB18030 4 byte converter
  173. if( NS_IS_HIGH_SURROGATE(unicode) )
  174. {
  175. if((iSrcLength+1) < *aSrcLength ) {
  176. res = EncodeSurrogate(aSrc[0],aSrc[1], aDest,
  177. iDestLength, *aDestLength);
  178. if (res == NS_OK) {
  179. // since we got a surrogate pair, we need to increment src.
  180. iSrcLength++ ;
  181. aSrc++;
  182. iDestLength += 4;
  183. aDest += 4;
  184. } else {
  185. if (res == NS_ERROR_UENC_NOMAPPING) {
  186. // only get a high surrogate, but not a low surrogate
  187. iSrcLength++; // include length of the unmapped character
  188. }
  189. break;
  190. }
  191. } else {
  192. mSurrogateHigh = aSrc[0];
  193. res = NS_OK;
  194. break; // this will go to afterwhileloop
  195. }
  196. } else {
  197. if( NS_IS_LOW_SURROGATE(unicode) )
  198. {
  199. if(NS_IS_HIGH_SURROGATE(mSurrogateHigh)) {
  200. res = EncodeSurrogate(mSurrogateHigh, aSrc[0], aDest,
  201. iDestLength, *aDestLength);
  202. if (res == NS_OK) {
  203. iDestLength += 4;
  204. aDest += 4;
  205. } else {
  206. if (res == NS_ERROR_UENC_NOMAPPING) {
  207. // only get a high surrogate, but not a low surrogate
  208. iSrcLength++; // include length of the unmapped character
  209. }
  210. break;
  211. }
  212. } else {
  213. // only get a low surrogate, but not a low surrogate
  214. res = NS_ERROR_UENC_NOMAPPING;
  215. iSrcLength++; // include length of the unmapped character
  216. break;
  217. }
  218. } else {
  219. outLen = *aDestLength - iDestLength;
  220. res = Try4BytesEncoder(unicode, aDest, &outLen);
  221. if (res == NS_OK) {
  222. iDestLength += outLen;
  223. aDest += outLen;
  224. } else {
  225. if (res == NS_ERROR_UENC_NOMAPPING) {
  226. iSrcLength++; // include length of the unmapped character
  227. }
  228. break;
  229. }
  230. }
  231. }
  232. }
  233. }
  234. }
  235. iSrcLength++ ; // Each unicode char just count as one in char16_t string;
  236. mSurrogateHigh = 0;
  237. aSrc++;
  238. }
  239. //afterwhileloop:
  240. *aDestLength = iDestLength;
  241. *aSrcLength = iSrcLength;
  242. return res;
  243. }