123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347 |
- #include <string.h>
- #include <stdlib.h>
- #include <stdio.h>
- #if defined(__APPLE__)
- #include <CoreFoundation/CoreFoundation.h>
- #elif defined(_WIN32)
- #include <windows.h>
- #else
- #include <iconv.h>
- #include <errno.h>
- #include <stdint.h>
- #endif
- #include "stringEncodingConverter.h"
#if !defined(__APPLE__) && !defined(_WIN32)
/* Overwrite every occurrence of the 3-byte sequence `from` in the
 * NUL-terminated string `str` with the 3-byte sequence `to`. */
static void replaceThreeByteSequence(char *str, const char *from, const char *to)
{
    char *hit = str;

    while ((hit = strstr(hit, from)) != NULL) {
        memcpy(hit, to, 3);
        hit += 3;
    }
}
#endif

/**
 * Convert a Shift_JIS (code page 932) byte string to UTF-8.
 *
 * @param in        Input bytes; only the first inLength bytes are read.
 * @param inLength  Number of input bytes.
 * @return A newly allocated, NUL-terminated UTF-8 string that the caller
 *         must free(), or NULL when the platform converter cannot be set up
 *         or memory cannot be allocated. In the iconv build, input that
 *         iconv rejects as an illegal sequence is replaced by '?'.
 */
char *convertShiftJISToUTF8(const char *in, size_t inLength)
{
#if defined(__APPLE__)
    CFStringRef inStr = CFStringCreateWithBytes(
        NULL,
        (const UInt8 *)in,
        inLength,
        kCFStringEncodingDOSJapanese,
        false
    );
    if (!inStr) return NULL;

    /* The first call only measures the output, the second one fills it. */
    CFIndex outLength = 0;
    CFRange range = CFRangeMake(0, CFStringGetLength(inStr));
    CFStringGetBytes(inStr, range, kCFStringEncodingUTF8, '?', false, NULL, 0, &outLength);
    char *out = calloc(outLength+1, 1);
    if (out) {
        CFStringGetBytes(inStr, range, kCFStringEncodingUTF8, '?', false, (UInt8 *)out, outLength, NULL);
    }
    CFRelease(inStr);
    return out;
#elif defined(_WIN32)
    /* Shift_JIS -> UTF-16 -> UTF-8; each step is measured before it is run. */
    int wideLength = MultiByteToWideChar(932, 0, in, (int)inLength, NULL, 0);
    if (wideLength == 0) return NULL;
    LPWSTR inWideStr = (LPWSTR)malloc(wideLength * sizeof(WCHAR));
    if (!inWideStr) return NULL;
    wideLength = MultiByteToWideChar(932, 0, in, (int)inLength, inWideStr, wideLength);
    int outLength = WideCharToMultiByte(CP_UTF8, 0, inWideStr, wideLength, NULL, 0, NULL, NULL);
    char *out = calloc(outLength+1, 1);
    if (out) {
        WideCharToMultiByte(CP_UTF8, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
    }
    free(inWideStr);
    return out;
#else
    /* Three output bytes per input byte plus the terminator; refuse sizes
     * for which that computation would wrap around. */
    if (inLength > (SIZE_MAX - 1) / 3) return NULL;

    size_t inBytesLeft = inLength;
    size_t outBytesLeft = inLength * 3 + 1;
    char *out = malloc(outBytesLeft);
    if (!out) return NULL;

    iconv_t cd = iconv_open("UTF-8", "CP932");
    if (cd == (iconv_t)-1) {
        free(out);
        return NULL;
    }

    char *in_p = (char *)in;
    char *out_p = out;
    for (;;) {
        size_t ret = iconv(cd, &in_p, &inBytesLeft, &out_p, &outBytesLeft);
        if (ret != (size_t)-1) break;
        if (errno != EILSEQ || inBytesLeft == 0) break;

        /* Step over the illegal sequence (up to two bytes, never more than
         * what is left, so the remaining count cannot wrap) and emit '?'. */
        size_t skip = inBytesLeft < 2 ? inBytesLeft : 2;
        in_p += skip;
        inBytesLeft -= skip;
        *out_p++ = '?';
        outBytesLeft--;
    }
    iconv_close(cd);
    *out_p = 0;

    /* U+301C, U+2016, U+2212 should be converted to U+FF5E, U+2225, U+FF0D */
    replaceThreeByteSequence(out, "\xe3\x80\x9c", "\xef\xbd\x9e");
    replaceThreeByteSequence(out, "\xe2\x80\x96", "\xe2\x88\xa5");
    replaceThreeByteSequence(out, "\xe2\x88\x92", "\xef\xbc\x8d");
    return out;
#endif
}
/**
 * Convert a UTF-8 byte string to Shift_JIS (code page 932).
 *
 * @param in        Input bytes; only the first inLength bytes are read.
 * @param inLength  Number of input bytes.
 * @return A newly allocated, NUL-terminated Shift_JIS string that the caller
 *         must free(), or NULL when the platform converter cannot be set up
 *         or memory cannot be allocated. In the iconv build, each sequence
 *         that iconv rejects is replaced by a single '?'.
 */
char *convertUTF8ToShiftJIS(const char *in, size_t inLength)
{
#if defined(__APPLE__)
    CFStringRef inStr = CFStringCreateWithBytes(
        NULL,
        (const UInt8 *)in,
        inLength,
        kCFStringEncodingUTF8,
        false
    );
    if (!inStr) return NULL;

    /* The first call only measures the output, the second one fills it. */
    CFIndex outLength = 0;
    CFRange range = CFRangeMake(0, CFStringGetLength(inStr));
    CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, '?', false, NULL, 0, &outLength);
    char *out = calloc(outLength+1, 1);
    if (out) {
        CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, '?', false, (UInt8 *)out, outLength, NULL);
    }
    CFRelease(inStr);
    return out;
#elif defined(_WIN32)
    /* UTF-8 -> UTF-16 -> Shift_JIS; each step is measured before it is run. */
    int wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, NULL, 0);
    if (wideLength == 0) return NULL;
    LPWSTR inWideStr = (LPWSTR)malloc(wideLength * sizeof(WCHAR));
    if (!inWideStr) return NULL;
    wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, inWideStr, wideLength);
    int outLength = WideCharToMultiByte(932, 0, inWideStr, wideLength, NULL, 0, NULL, NULL);
    char *out = calloc(outLength+1, 1);
    if (out) {
        WideCharToMultiByte(932, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
    }
    free(inWideStr);
    return out;
#else
    /* Refuse sizes for which the buffer size computation would wrap. */
    if (inLength > (SIZE_MAX - 1) / 3) return NULL;

    size_t inBytesLeft = inLength;
    size_t outBytesLeft = inLength * 3 + 1;
    char *out = malloc(outBytesLeft);
    if (!out) return NULL;

    iconv_t cd = iconv_open("CP932", "UTF-8");
    if (cd == (iconv_t)-1) {
        free(out);
        return NULL;
    }

    char *in_p = (char *)in;
    char *out_p = out;
    for (;;) {
        size_t ret = iconv(cd, &in_p, &inBytesLeft, &out_p, &outBytesLeft);
        if (ret != (size_t)-1) break;
        if (errno != EILSEQ || inBytesLeft == 0) break;

        /* Drop the offending byte together with the UTF-8 continuation
         * bytes (10xxxxxx) that follow it, so one rejected sequence yields
         * one '?' rather than one per byte. */
        in_p++;
        inBytesLeft--;
        while (inBytesLeft > 0 && ((unsigned char)*in_p & 0xc0) == 0x80) {
            in_p++;
            inBytesLeft--;
        }
        *out_p++ = '?';
        outBytesLeft--;
    }
    iconv_close(cd);
    *out_p = 0;
    return out;
#endif
}
#if !defined(__APPLE__) && !defined(_WIN32)
/* Write the stand-in for a code point that could not be encoded at `dst`,
 * which has `room` bytes available: the bytes 0x81 0x60 for U+301C (wave
 * dash), otherwise a decimal numeric character reference "&#N;".
 * Returns the number of bytes written, not counting any terminator. */
static size_t writeUnencodableCodePoint(char *dst, size_t room, uint32_t codePoint)
{
    if (codePoint == 0x301c) { /* wave dash */
        if (room < 2) return 0;
        memcpy(dst, "\x81\x60", 2);
        return 2;
    }
    int written = snprintf(dst, room, "&#%u;", (unsigned int)codePoint);
    if (written < 0 || (size_t)written >= room) return 0;
    return (size_t)written;
}
#endif

/**
 * Convert a UTF-8 byte string to Shift_JIS (code page 932), writing
 * characters that cannot be encoded as decimal numeric character references
 * ("&#N;", N being the Unicode code point) instead of dropping them.
 *
 * @param in        Input bytes; only the first inLength bytes are read.
 * @param inLength  Number of input bytes.
 * @return A newly allocated, NUL-terminated string that the caller must
 *         free(), or NULL when the platform converter cannot be set up or
 *         memory cannot be allocated. The CoreFoundation build also returns
 *         NULL when the input produces no output at all.
 */
char *convertUTF8ToShiftJISWithNCR(const char *in, size_t inLength)
{
#if defined(__APPLE__)
    CFStringRef inStr = CFStringCreateWithBytes(
        NULL,
        (const UInt8 *)in,
        inLength,
        kCFStringEncodingUTF8,
        false
    );
    if (!inStr) return NULL;

    char *out = NULL;
    size_t bytesWritten = 0;
    CFIndex start = 0;
    CFIndex inputChars = CFStringGetLength(inStr);
    while (start < inputChars) {
        char *grown;
        CFIndex outLength = 0;
        CFRange range = CFRangeMake(start, inputChars - start);

        /* With a loss byte of 0 the conversion is not lossy, so this
         * converts the longest encodable run beginning at `start`. */
        CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, 0, false, NULL, 0, &outLength);
        if (outLength) {
            grown = realloc(out, bytesWritten+outLength+1);
            if (!grown) {
                free(out);
                out = NULL;
                break;
            }
            out = grown;
            CFIndex converted = CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, 0, false, (UInt8 *)(out+bytesWritten), outLength, &outLength);
            if (converted) {
                start += converted;
                bytesWritten += outLength;
            }
            if (start >= inputChars) break;
        }

        /* The UTF-16 unit at `start` was not converted: emit it as an NCR,
         * combining a surrogate pair into one code point first. */
        unsigned int c1 = CFStringGetCharacterAtIndex(inStr, start++);
        if (c1 >= 0xd800 && c1 <= 0xdbff && start < inputChars) {
            unsigned int c2 = CFStringGetCharacterAtIndex(inStr, start);
            if (c2 >= 0xdc00 && c2 <= 0xdfff) {
                start++;
                c1 = 0x10000 + ((c1 - 0xd800) << 10) + (c2 - 0xdc00);
            }
        }
        /* "&#1114111;" is the longest possible reference: 10 bytes + NUL. */
        grown = realloc(out, bytesWritten+10+1);
        if (!grown) {
            free(out);
            out = NULL;
            break;
        }
        out = grown;
        snprintf(out+bytesWritten, 11, "&#%u;", c1);
        bytesWritten += strlen(out+bytesWritten);
    }
    if (out) out[bytesWritten] = 0;
    CFRelease(inStr);
    return out;
#elif defined(_WIN32)
    int wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, NULL, 0);
    if (wideLength == 0) return NULL;
    LPWSTR inWideStr = (LPWSTR)malloc(wideLength * sizeof(WCHAR));
    if (!inWideStr) return NULL;
    wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, inWideStr, wideLength);

    BOOL isLossy = FALSE;
    char *out = NULL;
    int outLength = WideCharToMultiByte(932, 0, inWideStr, wideLength, NULL, 0, NULL, &isLossy);
    if (!isLossy) {
        /* Everything is encodable: convert in one go. */
        out = calloc(outLength+1, 1);
        if (out) {
            WideCharToMultiByte(932, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
        }
    } else {
        int bytesWritten = 0;
        int start = 0;
        while (start < wideLength) {
            char *grown;

            /* Extend the run one UTF-16 unit at a time, stopping before the
             * first unit that makes the conversion lossy. Every unit up to
             * and including the last one of the string is probed. */
            int lengthToConvert = 0;
            while (start+lengthToConvert < wideLength) {
                WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr+start, lengthToConvert+1, NULL, 0, NULL, &isLossy);
                if (isLossy) break;
                lengthToConvert++;
            }
            if (lengthToConvert) {
                int chunkLength = WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr+start, lengthToConvert, NULL, 0, NULL, NULL);
                grown = realloc(out, bytesWritten+chunkLength+1);
                if (!grown) {
                    free(out);
                    out = NULL;
                    break;
                }
                out = grown;
                bytesWritten += WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr+start, lengthToConvert, out+bytesWritten, chunkLength, NULL, NULL);
                start += lengthToConvert;
            }
            if (start >= wideLength) break;

            /* The unit at `start` is not encodable; combine a surrogate
             * pair into one code point before writing the replacement. */
            unsigned int c1 = inWideStr[start++];
            if (c1 >= 0xd800 && c1 <= 0xdbff && start < wideLength) {
                unsigned int c2 = inWideStr[start];
                if (c2 >= 0xdc00 && c2 <= 0xdfff) {
                    start++;
                    c1 = 0x10000 + ((c1 - 0xd800) << 10) + (c2 - 0xdc00);
                }
            }
            if (c1 == 0x301c) { /* wave dash */
                grown = realloc(out, bytesWritten+3);
                if (!grown) {
                    free(out);
                    out = NULL;
                    break;
                }
                out = grown;
                memcpy(out+bytesWritten, "\x81\x60", 3);
                bytesWritten += 2;
            } else {
                /* "&#1114111;" is the longest reference: 10 bytes + NUL. */
                grown = realloc(out, bytesWritten+10+1);
                if (!grown) {
                    free(out);
                    out = NULL;
                    break;
                }
                out = grown;
                snprintf(out+bytesWritten, 11, "&#%u;", c1);
                bytesWritten += strlen(out+bytesWritten);
            }
        }
        if (out) out[bytesWritten] = 0;
    }
    free(inWideStr);
    return out;
#else
    /* Each input byte yields at most one code point and each code point is
     * given 12 output bytes below; refuse sizes where that would wrap. */
    if (inLength > (SIZE_MAX - 1) / 12) return NULL;

    /* Stage 1: decode UTF-8 into native-endian 32-bit code points. */
    size_t inBytesLeft = inLength;
    size_t outBytesLeft = inLength * 4;
    uint32_t *utf32 = calloc(inLength + 1, sizeof *utf32);
    if (!utf32) return NULL;

    const char *codeset = "UCS-4-INTERNAL";
    iconv_t cd = iconv_open(codeset, "UTF-8");
    if (cd == (iconv_t)-1) {
        /* That name was not accepted: pick the explicit byte order that
         * matches this machine instead. */
        const int one = 1;
        codeset = *(const char *)&one ? "UCS-4LE" : "UCS-4BE";
        cd = iconv_open(codeset, "UTF-8");
    }
    if (cd == (iconv_t)-1) {
        free(utf32);
        return NULL;
    }

    char *in_p = (char *)in;
    char *word_p = (char *)utf32;
    for (;;) {
        size_t ret = iconv(cd, &in_p, &inBytesLeft, &word_p, &outBytesLeft);
        if (ret != (size_t)-1) break;
        if (errno != EILSEQ || inBytesLeft == 0) break;
        /* Bytes that are not valid UTF-8 are skipped without a trace. */
        in_p++;
        inBytesLeft--;
    }
    iconv_close(cd);
    size_t numUnicodeWords = (size_t)(word_p - (char *)utf32) / 4;

    /* Stage 2: encode the code points as CP932. A 4-byte code point may turn
     * into a 10-byte "&#xxxxxxx;" sequence, so 12 bytes each is safe. */
    size_t outCapacity = numUnicodeWords * 4 * 3 + 1;
    char *out = malloc(outCapacity);
    if (!out) {
        free(utf32);
        return NULL;
    }
    cd = iconv_open("CP932", codeset);
    if (cd == (iconv_t)-1) {
        free(out);
        free(utf32);
        return NULL;
    }

    uint32_t codePoint;
    size_t numInvalidConversions = 0;
    char *out_p = out;
    word_p = (char *)utf32;
    inBytesLeft = numUnicodeWords * 4;
    outBytesLeft = outCapacity;
    for (;;) {
        size_t ret = iconv(cd, &word_p, &inBytesLeft, &out_p, &outBytesLeft);
        if (ret != (size_t)-1) {
            /* A positive count means iconv substituted characters itself. */
            numInvalidConversions = ret;
            break;
        }
        if (errno != EILSEQ || inBytesLeft < 4) break;

        /* iconv stopped in front of a code point it cannot encode. */
        memcpy(&codePoint, word_p, sizeof codePoint);
        size_t written = writeUnencodableCodePoint(out_p, outBytesLeft, codePoint);
        out_p += written;
        outBytesLeft -= written;
        word_p += 4;
        inBytesLeft -= 4;
    }

    if (numInvalidConversions) {
        /* fallback to per character conversion mode - FreeBSD/NetBSD iconv? */
        out_p = out;
        outBytesLeft = outCapacity;
        word_p = (char *)utf32;
        for (size_t wordsLeft = numUnicodeWords; wordsLeft > 0; wordsLeft--) {
            char *wordStart = word_p;
            char *outBefore = out_p;
            size_t roomBefore = outBytesLeft;

            inBytesLeft = 4;
            size_t ret = iconv(cd, &word_p, &inBytesLeft, &out_p, &outBytesLeft);
            if (ret == (size_t)-1 && errno != EILSEQ) break;
            if (ret == 0) continue;

            /* Not encodable: throw away whatever placeholder iconv may have
             * written (whatever its length), step past the code point and
             * write our own replacement. */
            word_p = wordStart + 4;
            out_p = outBefore;
            outBytesLeft = roomBefore;
            memcpy(&codePoint, wordStart, sizeof codePoint);
            size_t written = writeUnencodableCodePoint(out_p, outBytesLeft, codePoint);
            out_p += written;
            outBytesLeft -= written;
        }
    }
    iconv_close(cd);
    free(utf32);
    *out_p = 0;
    return out;
#endif
}
|