123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245 |
- /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
- /**
- * A character set converter from GBK to Unicode.
- *
- *
- * @created 07/Sept/1999
- * @author Yueheng Xu, Yueheng.Xu@intel.com
- */
- #include "nsGBKToUnicode.h"
- #include "gbku.h"
- #include "nsUnicodeDecodeHelper.h"
- static const uint16_t g_utGB18030Unique2Bytes[] = {
- #include "gb18030uniq2b.ut"
- };
- static const uint16_t g_utGB18030Unique4Bytes[] = {
- #include "gb180304bytes.ut"
- };
- //----------------------------------------------------------------------
- // Class nsGB18030ToUnicode [implementation]
- //----------------------------------------------------------------------
- // Subclassing of nsBufferDecoderSupport class [implementation]
- #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) \
- (UINT8_IN_RANGE(0x81, (c), 0xFE))
- #define FIRST_BYTE_IS_SURROGATE(c) \
- (UINT8_IN_RANGE(0x90, (c), 0xFE))
- #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
- (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))
- #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
- (UINT8_IN_RANGE(0x30, (c), 0x39))
- #define LEGAL_GBK_4BYTE_THIRD_BYTE(c) \
- (UINT8_IN_RANGE(0x81, (c), 0xFE))
- #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
- (UINT8_IN_RANGE(0x30, (c), 0x39))
- NS_IMETHODIMP nsGB18030ToUnicode::ConvertNoBuff(const char* aSrc,
- int32_t * aSrcLength,
- char16_t *aDest,
- int32_t * aDestLength)
- {
- int32_t i=0;
- int32_t iSrcLength = (*aSrcLength);
- int32_t iDestlen = 0;
- nsresult rv=NS_OK;
- *aSrcLength = 0;
-
- for (i=0;i<iSrcLength;i++)
- {
- if ( iDestlen >= (*aDestLength) )
- {
- rv = NS_OK_UDEC_MOREOUTPUT;
- break;
- }
- // The valid range for the 1st byte is [0x81,0xFE]
- if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
- {
- if(i+1 >= iSrcLength)
- {
- rv = NS_OK_UDEC_MOREINPUT;
- break;
- }
- // To make sure, the second byte has to be checked as well.
- // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
- if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
- {
- // Valid GBK code
- *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
- if(UCS2_NO_MAPPING == *aDest)
- {
- // We cannot map in the common mapping, let's call the
- // delegate 2 byte decoder to decode the gbk or gb18030 unique
- // 2 byte mapping
- if(! TryExtensionDecoder(aSrc, aDest))
- {
- *aDest = UCS2_NO_MAPPING;
- }
- }
- aSrc += 2;
- i++;
- }
- else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
- {
- // from the first 2 bytes, it looks like a 4 byte GB18030
- if(i+3 >= iSrcLength) // make sure we got 4 bytes
- {
- rv = NS_OK_UDEC_MOREINPUT;
- break;
- }
- // 4 bytes patten
- // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
- // preset the
-
- if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
- LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
- {
- if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
- {
- // let's call the delegated 4 byte gb18030 converter to convert it
- if (!Try4BytesDecoder(aSrc, aDest)) {
- *aDest = UCS2_NO_MAPPING;
- }
- // Swapped character in GB18030-2005
- if (*aDest == 0x1E3F) {
- *aDest = 0xE7C7;
- }
- } else {
- // let's try supplement mapping
- if ( (iDestlen+1) < (*aDestLength) )
- {
- if(DecodeToSurrogate(aSrc, aDest))
- {
- // surrogte two char16_t
- iDestlen++;
- aDest++;
- } else {
- *aDest = UCS2_NO_MAPPING;
- }
- } else {
- if (*aDestLength < 2) {
- NS_ERROR("insufficient space in output buffer");
- *aDest = UCS2_NO_MAPPING;
- } else {
- rv = NS_OK_UDEC_MOREOUTPUT;
- break;
- }
- }
- }
- aSrc += 4;
- i += 3;
- } else {
- *aDest = UCS2_NO_MAPPING;
- // If the third and fourth bytes are not in the legal ranges for
- // a four-byte sequnce, resynchronize on the second byte
- // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
- // 0x30-0x39)
- aSrc++;
- }
- }
- else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
- {
- // stand-alone (not followed by a valid second byte) 0xA0 !
- // treat it as valid a la Netscape 4.x
- *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
- aSrc++;
- } else {
- // Invalid GBK code point (second byte should be 0x40 or higher)
- *aDest = UCS2_NO_MAPPING;
- aSrc++;
- }
- } else {
- if(IS_ASCII(*aSrc))
- {
- // The source is an ASCII
- *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
- aSrc++;
- } else {
- if(IS_GBK_EURO(*aSrc)) {
- *aDest = UCS2_EURO;
- } else {
- *aDest = UCS2_NO_MAPPING;
- }
- aSrc++;
- }
- }
- iDestlen++;
- aDest++;
- *aSrcLength = i+1;
- }
- *aDestLength = iDestlen;
- return rv;
- }
- bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
- {
- NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte");
- NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte");
- NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte");
- NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte");
- if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
- return false;
- if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
- return false;
- if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
- return false;
- if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
- return false;
- uint8_t a1 = (uint8_t) aSrc[0];
- uint8_t a2 = (uint8_t) aSrc[1];
- uint8_t a3 = (uint8_t) aSrc[2];
- uint8_t a4 = (uint8_t) aSrc[3];
- a1 -= (uint8_t)0x90;
- a2 -= (uint8_t)0x30;
- a3 -= (uint8_t)0x81;
- a4 -= (uint8_t)0x30;
- uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
- // idx == ucs4Codepoint - 0x10000
- if (idx > 0x000FFFFF)
- return false;
- *aOut++ = 0xD800 | (idx >> 10);
- *aOut = 0xDC00 | (0x000003FF & idx);
- return true;
- }
- bool nsGB18030ToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
- {
- int32_t len = 2;
- int32_t dstlen = 1;
- nsresult res =
- nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
- u2BytesCharset, nullptr,
- (uMappingTable*) &g_utGB18030Unique2Bytes,
- false);
- NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
- "some strange conversion result");
- // if we failed, we then just use the 0xfffd
- // therefore, we ignore the res here.
- return NS_SUCCEEDED(res);
- }
- bool nsGB18030ToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
- {
- int32_t len = 4;
- int32_t dstlen = 1;
- nsresult res =
- nsUnicodeDecodeHelper::ConvertByTable(aSrc, &len, aOut, &dstlen,
- u4BytesGB18030Charset, nullptr,
- (uMappingTable*) &g_utGB18030Unique4Bytes,
- false);
- NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
- "some strange conversion result");
- // if we failed, we then just use the 0xfffd
- // therefore, we ignore the res here.
- return NS_SUCCEEDED(res);
- }
|