stringEncodingConverter.c 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. #include <string.h>
  2. #include <stdlib.h>
  3. #include <stdio.h>
  4. #if defined(__APPLE__)
  5. #include <CoreFoundation/CoreFoundation.h>
  6. #elif defined(_WIN32)
  7. #include <windows.h>
  8. #else
  9. #include <iconv.h>
  10. #include <errno.h>
  11. #include <stdint.h>
  12. #endif
  13. #include "stringEncodingConverter.h"
  /**
   * Convert a Shift_JIS (Windows code page 932) byte sequence to UTF-8.
   *
   * Bytes that cannot be decoded are replaced with '?'.
   *
   * @param in        Input bytes; does not need to be NUL-terminated.
   * @param inLength  Number of bytes to read from `in`.
   * @return A newly allocated, NUL-terminated UTF-8 string that the caller must
   *         free(), or NULL on failure (allocation failure, converter not
   *         available, or - on the Windows path - a conversion that yields
   *         zero wide characters).
   */
  char *convertShiftJISToUTF8(const char *in, size_t inLength)
  {
  #if defined(__APPLE__)
      CFStringRef inStr = CFStringCreateWithBytes(
          NULL,
          (const UInt8 *)in,
          inLength,
          kCFStringEncodingDOSJapanese,
          false
      );
      if (!inStr) return NULL;
      CFIndex outLength = 0;
      CFRange range = CFRangeMake(0, CFStringGetLength(inStr));
      /* First pass only measures the required size; second pass fills the buffer. */
      CFStringGetBytes(inStr, range, kCFStringEncodingUTF8, '?', false, NULL, 0, &outLength);
      char *out = calloc((size_t)outLength + 1, 1);
      if (out) {
          CFStringGetBytes(inStr, range, kCFStringEncodingUTF8, '?', false, (UInt8 *)out, outLength, NULL);
      }
      CFRelease(inStr);
      return out;
  #elif defined(_WIN32)
      /* The Win32 API takes an int length; refuse inputs that would not fit. */
      if (inLength > 0x7fffffff) return NULL;
      int wideLength = MultiByteToWideChar(932, 0, in, (int)inLength, NULL, 0);
      if (wideLength == 0) return NULL;
      LPWSTR inWideStr = (LPWSTR)malloc((size_t)wideLength * sizeof(WCHAR));
      if (!inWideStr) return NULL;
      wideLength = MultiByteToWideChar(932, 0, in, (int)inLength, inWideStr, wideLength);
      int outLength = WideCharToMultiByte(CP_UTF8, 0, inWideStr, wideLength, NULL, 0, NULL, NULL);
      char *out = calloc((size_t)outLength + 1, 1);
      if (out) {
          WideCharToMultiByte(CP_UTF8, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
      }
      free(inWideStr);
      return out;
  #else
      /* Up to 3 output bytes per input byte, plus the terminator. */
      if (inLength > (SIZE_MAX - 1) / 3) return NULL;
      size_t capacity = inLength * 3 + 1;
      char *out = malloc(capacity);
      if (!out) return NULL;

      iconv_t cd = iconv_open("UTF-8", "CP932");
      if (cd == (iconv_t)-1) {
          free(out);
          return NULL;
      }

      char *src = (char *)in; /* iconv() takes a non-const pointer but does not write through it */
      size_t inBytesLeft = inLength;
      char *out_p = out;
      size_t outBytesLeft = capacity - 1; /* keep one byte for the terminator */

      for (;;) {
          size_t ret = iconv(cd, &src, &inBytesLeft, &out_p, &outBytesLeft);
          if (ret != (size_t)-1 || errno != EILSEQ) break;
          /* Undecodable input: drop it (two bytes, or the single remaining
             byte so the counter cannot wrap around) and emit '?'. */
          size_t skip = inBytesLeft < 2 ? inBytesLeft : 2;
          if (skip == 0 || outBytesLeft == 0) break;
          src += skip;
          inBytesLeft -= skip;
          *out_p++ = '?';
          outBytesLeft--;
      }
      iconv_close(cd);
      *out_p = '\0';

      /* U+301C, U+2016, U+2212 should be converted to U+FF5E, U+2225, U+FF0D */
      static const char *const fixups[][2] = {
          { "\xe3\x80\x9c", "\xef\xbd\x9e" },
          { "\xe2\x80\x96", "\xe2\x88\xa5" },
          { "\xe2\x88\x92", "\xef\xbc\x8d" },
      };
      for (size_t i = 0; i < sizeof fixups / sizeof fixups[0]; i++) {
          char *hit = out;
          while ((hit = strstr(hit, fixups[i][0])) != NULL) {
              memcpy(hit, fixups[i][1], 3);
              hit += 3;
          }
      }
      return out;
  #endif
  }
  /**
   * Convert a UTF-8 byte sequence to Shift_JIS (Windows code page 932).
   *
   * Characters that cannot be represented are replaced with '?'.
   *
   * @param in        Input bytes; does not need to be NUL-terminated.
   * @param inLength  Number of bytes to read from `in`.
   * @return A newly allocated, NUL-terminated string that the caller must
   *         free(), or NULL on failure (allocation failure, converter not
   *         available, or - on the Windows path - a conversion that yields
   *         zero wide characters).
   */
  char *convertUTF8ToShiftJIS(const char *in, size_t inLength)
  {
  #if defined(__APPLE__)
      CFStringRef inStr = CFStringCreateWithBytes(
          NULL,
          (const UInt8 *)in,
          inLength,
          kCFStringEncodingUTF8,
          false
      );
      if (!inStr) return NULL;
      CFIndex outLength = 0;
      CFRange range = CFRangeMake(0, CFStringGetLength(inStr));
      /* First pass only measures the required size; second pass fills the buffer. */
      CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, '?', false, NULL, 0, &outLength);
      char *out = calloc((size_t)outLength + 1, 1);
      if (out) {
          CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, '?', false, (UInt8 *)out, outLength, NULL);
      }
      CFRelease(inStr);
      return out;
  #elif defined(_WIN32)
      /* The Win32 API takes an int length; refuse inputs that would not fit. */
      if (inLength > 0x7fffffff) return NULL;
      int wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, NULL, 0);
      if (wideLength == 0) return NULL;
      LPWSTR inWideStr = (LPWSTR)malloc((size_t)wideLength * sizeof(WCHAR));
      if (!inWideStr) return NULL;
      wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, inWideStr, wideLength);
      int outLength = WideCharToMultiByte(932, 0, inWideStr, wideLength, NULL, 0, NULL, NULL);
      char *out = calloc((size_t)outLength + 1, 1);
      if (out) {
          WideCharToMultiByte(932, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
      }
      free(inWideStr);
      return out;
  #else
      if (inLength > (SIZE_MAX - 1) / 3) return NULL;
      size_t capacity = inLength * 3 + 1;
      char *out = malloc(capacity);
      if (!out) return NULL;

      iconv_t cd = iconv_open("CP932", "UTF-8");
      if (cd == (iconv_t)-1) {
          free(out);
          return NULL;
      }

      char *src = (char *)in; /* iconv() takes a non-const pointer but does not write through it */
      size_t inBytesLeft = inLength;
      char *out_p = out;
      size_t outBytesLeft = capacity - 1; /* keep one byte for the terminator */

      for (;;) {
          size_t ret = iconv(cd, &src, &inBytesLeft, &out_p, &outBytesLeft);
          if (ret != (size_t)-1 || errno != EILSEQ) break;
          if (inBytesLeft == 0 || outBytesLeft == 0) break;

          /* Skip the whole offending UTF-8 sequence (lead byte plus the
             continuation bytes it announces) so that one unencodable
             character yields exactly one '?'. */
          unsigned char lead = (unsigned char)*src;
          size_t seqLen = lead >= 0xf0 ? 4 : lead >= 0xe0 ? 3 : lead >= 0xc0 ? 2 : 1;
          size_t skip = 1;
          while (skip < seqLen && skip < inBytesLeft &&
                 ((unsigned char)src[skip] & 0xc0) == 0x80) {
              skip++;
          }
          src += skip;
          inBytesLeft -= skip;
          *out_p++ = '?';
          outBytesLeft--;
      }
      iconv_close(cd);
      *out_p = '\0';
      return out;
  #endif
  }
  #if !defined(__APPLE__) && !defined(_WIN32)
  /* Append the stand-in for a code point that could not be encoded: the
     Shift_JIS wave dash bytes for U+301C, otherwise a decimal numeric character
     reference. Returns 1 on success, 0 if the output buffer is too small. */
  static int appendUnencodableAsNCR(uint32_t codePoint, char **out_p, size_t *outBytesLeft)
  {
      if (codePoint == 0x301c) { /* wave dash */
          if (*outBytesLeft < 2) return 0;
          memcpy(*out_p, "\x81\x60", 2);
          *out_p += 2;
          *outBytesLeft -= 2;
          return 1;
      }
      int written = snprintf(*out_p, *outBytesLeft, "&#%u;", (unsigned int)codePoint);
      if (written < 0 || (size_t)written >= *outBytesLeft) return 0;
      *out_p += written;
      *outBytesLeft -= (size_t)written;
      return 1;
  }
  #endif

  /**
   * Convert a UTF-8 byte sequence to Shift_JIS (Windows code page 932),
   * writing characters that cannot be represented as decimal numeric character
   * references ("&#NNNN;") instead of dropping them.
   *
   * On the Windows and iconv paths U+301C (wave dash) is written as the
   * Shift_JIS bytes 0x81 0x60 when the converter rejects it.
   *
   * @param in        Input bytes; does not need to be NUL-terminated.
   * @param inLength  Number of bytes to read from `in`.
   * @return A newly allocated, NUL-terminated string that the caller must
   *         free(), or NULL on failure. The Apple and Windows paths also return
   *         NULL when the input produces no characters at all.
   */
  char *convertUTF8ToShiftJISWithNCR(const char *in, size_t inLength)
  {
  #if defined(__APPLE__)
      CFStringRef inStr = CFStringCreateWithBytes(
          NULL,
          (const UInt8 *)in,
          inLength,
          kCFStringEncodingUTF8,
          false
      );
      if (!inStr) return NULL;
      char *out = NULL;
      size_t bytesWritten = 0;
      CFIndex start = 0;
      CFIndex inputChars = CFStringGetLength(inStr);
      while (start < inputChars) {
          CFIndex outLength = 0;
          CFRange range = CFRangeMake(start, inputChars - start);
          /* With a loss byte of 0 the conversion stops at the first character
             that cannot be encoded; this call measures that convertible run. */
          CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, 0, false, NULL, 0, &outLength);
          if (outLength) {
              char *grown = realloc(out, bytesWritten + (size_t)outLength + 1);
              if (!grown) {
                  free(out);
                  CFRelease(inStr);
                  return NULL;
              }
              out = grown;
              CFIndex converted = CFStringGetBytes(inStr, range, kCFStringEncodingDOSJapanese, 0, false, (UInt8 *)(out + bytesWritten), outLength, &outLength);
              if (converted) {
                  start += converted;
                  bytesWritten += (size_t)outLength;
              }
              if (start >= inputChars) break;
          }
          /* The character at `start` could not be encoded: emit it as an NCR. */
          unsigned int c1 = CFStringGetCharacterAtIndex(inStr, start++);
          if (c1 >= 0xd800 && c1 <= 0xdbff && start < inputChars) {
              /* Combine the surrogate pair into one code point. */
              unsigned int c2 = CFStringGetCharacterAtIndex(inStr, start++);
              c1 = c1 - 0xd800;
              c2 = c2 - 0xdc00;
              c1 = (0x10000 + (c1 << 10)) | c2;
          }
          /* "&#1114111;" is the longest possible reference: 10 bytes + NUL. */
          char *grown = realloc(out, bytesWritten + 10 + 1);
          if (!grown) {
              free(out);
              CFRelease(inStr);
              return NULL;
          }
          out = grown;
          snprintf(out + bytesWritten, 11, "&#%u;", c1);
          bytesWritten += strlen(out + bytesWritten);
      }
      if (out) out[bytesWritten] = 0;
      CFRelease(inStr);
      return out;
  #elif defined(_WIN32)
      /* The Win32 API takes an int length; refuse inputs that would not fit. */
      if (inLength > 0x7fffffff) return NULL;
      int wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, NULL, 0);
      if (wideLength == 0) return NULL;
      LPWSTR inWideStr = (LPWSTR)malloc((size_t)wideLength * sizeof(WCHAR));
      if (!inWideStr) return NULL;
      wideLength = MultiByteToWideChar(CP_UTF8, 0, in, (int)inLength, inWideStr, wideLength);
      BOOL isLossy = FALSE;
      char *out = NULL;
      int outLength = WideCharToMultiByte(932, 0, inWideStr, wideLength, NULL, 0, NULL, &isLossy);
      if (!isLossy) {
          /* Everything is representable: convert in one go. */
          out = calloc((size_t)outLength + 1, 1);
          if (out) {
              WideCharToMultiByte(932, 0, inWideStr, wideLength, out, outLength, NULL, NULL);
          }
      } else {
          int bytesWritten = 0;
          int start = 0;
          while (start < wideLength) {
              /* Find the longest run starting at `start` that converts without
                 loss. Every character up to and including the last one in the
                 string is checked. */
              int lengthToConvert = 0;
              while (start + lengthToConvert < wideLength) {
                  isLossy = FALSE;
                  WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr + start, lengthToConvert + 1, NULL, 0, NULL, &isLossy);
                  if (isLossy) break;
                  lengthToConvert++;
              }
              if (lengthToConvert) {
                  int runLength = WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr + start, lengthToConvert, NULL, 0, NULL, NULL);
                  char *grown = realloc(out, (size_t)bytesWritten + (size_t)runLength + 1);
                  if (!grown) {
                      free(out);
                      free(inWideStr);
                      return NULL;
                  }
                  out = grown;
                  bytesWritten += WideCharToMultiByte(932, WC_NO_BEST_FIT_CHARS, inWideStr + start, lengthToConvert, out + bytesWritten, runLength, NULL, NULL);
                  start += lengthToConvert;
              }
              if (start >= wideLength) break;
              /* The character at `start` could not be encoded. */
              unsigned int c1 = inWideStr[start++];
              if (c1 >= 0xd800 && c1 <= 0xdbff && start < wideLength) {
                  /* Combine the surrogate pair into one code point. */
                  unsigned int c2 = inWideStr[start++];
                  c1 = c1 - 0xd800;
                  c2 = c2 - 0xdc00;
                  c1 = (0x10000 + (c1 << 10)) | c2;
              }
              /* Room for either the 2-byte wave dash or the longest reference
                 ("&#1114111;", 10 bytes), plus the terminator. */
              char *grown = realloc(out, (size_t)bytesWritten + 10 + 1);
              if (!grown) {
                  free(out);
                  free(inWideStr);
                  return NULL;
              }
              out = grown;
              if (c1 == 0x301c) { /* wave dash */
                  memcpy(out + bytesWritten, "\x81\x60", 3);
                  bytesWritten += 2;
              } else {
                  snprintf(out + bytesWritten, 11, "&#%u;", c1);
                  bytesWritten += (int)strlen(out + bytesWritten);
              }
          }
          if (out) out[bytesWritten] = 0;
      }
      free(inWideStr);
      return out;
  #else
      /* Stage 1 needs 4 bytes per input byte at most; stage 2 reserves 12 output
         bytes per code point (the longest reference is 10 bytes) plus the
         terminator. Reject lengths for which those sizes would overflow. */
      if (inLength > (SIZE_MAX - 1) / 12) return NULL;

      size_t inBytesLeft = inLength;
      size_t outBytesLeft = inLength * 4;
      /* Never request a zero-sized block: its result is implementation-defined. */
      uint32_t *utf32 = malloc(outBytesLeft ? outBytesLeft : sizeof *utf32);
      if (!utf32) return NULL;
      uint32_t *utf32_p = utf32;

      /* Stage 1: UTF-8 -> native-endian UCS-4 so code points can be inspected. */
      const char *codeset = "UCS-4-INTERNAL";
      iconv_t cd = iconv_open(codeset, "UTF-8");
      if (cd == (iconv_t)-1) {
          /* Name not supported by this iconv: choose by host byte order. */
          int x = 1;
          if (*(char *)&x) {
              codeset = "UCS-4LE";
          } else {
              codeset = "UCS-4BE";
          }
          cd = iconv_open(codeset, "UTF-8");
      }
      if (cd == (iconv_t)-1) {
          free(utf32);
          return NULL;
      }

      char *src = (char *)in; /* iconv() takes a non-const pointer but does not write through it */
      for (;;) {
          size_t ret = iconv(cd, &src, &inBytesLeft, (char **)&utf32_p, &outBytesLeft);
          if (ret != (size_t)-1 || errno != EILSEQ || inBytesLeft == 0) break;
          /* Invalid UTF-8 byte: drop it and carry on. */
          src++;
          inBytesLeft--;
      }
      iconv_close(cd);

      /* Stage 2: UCS-4 -> CP932, substituting what cannot be encoded. */
      size_t numUnicodeWords = (size_t)(utf32_p - utf32);
      inBytesLeft = numUnicodeWords * 4;
      size_t capacity = inBytesLeft * 3 + 1; /* 4-byte utf-32 char may turn into 10-byte "&#xxxxxxx;" sequence - 3x multiplier is safe */
      char *out = malloc(capacity);
      if (!out) {
          free(utf32);
          return NULL;
      }
      cd = iconv_open("CP932", codeset);
      if (cd == (iconv_t)-1) {
          free(out);
          free(utf32);
          return NULL;
      }

      char *out_p = out;
      outBytesLeft = capacity;
      size_t numInvalidConversions = 0;
      utf32_p = utf32;
      for (;;) {
          size_t ret = iconv(cd, (char **)&utf32_p, &inBytesLeft, &out_p, &outBytesLeft);
          if (ret != (size_t)-1) {
              /* A positive count means iconv substituted characters itself
                 instead of reporting EILSEQ. */
              numInvalidConversions = ret;
              break;
          }
          if (errno != EILSEQ || inBytesLeft < 4) break;
          if (!appendUnencodableAsNCR(*utf32_p, &out_p, &outBytesLeft)) break;
          utf32_p++;
          inBytesLeft -= 4;
      }

      if (numInvalidConversions) {
          /* fallback to per character conversion mode - FreeBSD/NetBSD iconv? */
          outBytesLeft = capacity;
          out_p = out;
          utf32_p = utf32;
          for (size_t wordsLeft = numUnicodeWords; wordsLeft > 0; wordsLeft--) {
              char *charStart = out_p;
              size_t spaceBefore = outBytesLeft;
              inBytesLeft = 4;
              size_t ret = iconv(cd, (char **)&utf32_p, &inBytesLeft, &out_p, &outBytesLeft);
              if (ret == 0) continue; /* converted cleanly */
              if (ret == (size_t)-1) {
                  if (errno != EILSEQ) break;
                  utf32_p++; /* step over the rejected code point */
              } else {
                  /* iconv wrote its own substitute: discard it, whatever its
                     length, by restoring the saved output position. */
                  out_p = charStart;
                  outBytesLeft = spaceBefore;
              }
              if (!appendUnencodableAsNCR(*(utf32_p - 1), &out_p, &outBytesLeft)) break;
          }
      }
      iconv_close(cd);
      free(utf32);
      *out_p = 0;
      return out;
  #endif
  }