/* utfconv.c */
  1. /*
  2. * This program is free software; you can redistribute it and/or
  3. * modify it under the terms of the GNU General Public License
  4. * as published by the Free Software Foundation; either version 2
  5. * of the License, or (at your option) any later version.
  6. *
  7. * This program is distributed in the hope that it will be useful,
  8. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. * GNU General Public License for more details.
  11. *
  12. * You should have received a copy of the GNU General Public License
  13. * along with this program; if not, write to the Free Software Foundation,
  14. * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  15. *
  16. * The Original Code is Copyright (C) 2012 Blender Foundation.
  17. * All rights reserved.
  18. *
  19. */
  20. #include "utfconv.h"
  21. size_t count_utf_8_from_16(const wchar_t *string16)
  22. {
  23. int i;
  24. size_t count = 0;
  25. wchar_t u = 0;
  26. if (!string16) {
  27. return 0;
  28. }
  29. for (i = 0; (u = string16[i]); i++) {
  30. if (u < 0x0080) {
  31. count += 1;
  32. }
  33. else {
  34. if (u < 0x0800) {
  35. count += 2;
  36. }
  37. else {
  38. if (u < 0xD800) {
  39. count += 3;
  40. }
  41. else {
  42. if (u < 0xDC00) {
  43. i++;
  44. if ((u = string16[i]) == 0) {
  45. break;
  46. }
  47. if (u >= 0xDC00 && u < 0xE000) {
  48. count += 4;
  49. }
  50. }
  51. else {
  52. if (u < 0xE000) {
  53. /*illigal*/;
  54. }
  55. else {
  56. count += 3;
  57. }
  58. }
  59. }
  60. }
  61. }
  62. }
  63. return ++count;
  64. }
  65. size_t count_utf_16_from_8(const char *string8)
  66. {
  67. size_t count = 0;
  68. char u;
  69. char type = 0;
  70. unsigned int u32 = 0;
  71. if (!string8)
  72. return 0;
  73. for (; (u = *string8); string8++) {
  74. if (type == 0) {
  75. if ((u & 0x01 << 7) == 0) {
  76. count++;
  77. u32 = 0;
  78. continue;
  79. } // 1 utf-8 char
  80. if ((u & 0x07 << 5) == 0xC0) {
  81. type = 1;
  82. u32 = u & 0x1F;
  83. continue;
  84. } // 2 utf-8 char
  85. if ((u & 0x0F << 4) == 0xE0) {
  86. type = 2;
  87. u32 = u & 0x0F;
  88. continue;
  89. } // 3 utf-8 char
  90. if ((u & 0x1F << 3) == 0xF0) {
  91. type = 3;
  92. u32 = u & 0x07;
  93. continue;
  94. } // 4 utf-8 char
  95. continue;
  96. }
  97. else {
  98. if ((u & 0xC0) == 0x80) {
  99. u32 = (u32 << 6) | (u & 0x3F);
  100. type--;
  101. }
  102. else {
  103. u32 = 0;
  104. type = 0;
  105. }
  106. }
  107. if (type == 0) {
  108. if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000))
  109. count++;
  110. else if (0x10000 <= u32 && u32 < 0x110000)
  111. count += 2;
  112. u32 = 0;
  113. }
  114. }
  115. return ++count;
  116. }
  117. int conv_utf_16_to_8(const wchar_t *in16, char *out8, size_t size8)
  118. {
  119. char *out8end = out8 + size8;
  120. wchar_t u = 0;
  121. int err = 0;
  122. if (!size8 || !in16 || !out8)
  123. return UTF_ERROR_NULL_IN;
  124. out8end--;
  125. for (; out8 < out8end && (u = *in16); in16++, out8++) {
  126. if (u < 0x0080) {
  127. *out8 = u;
  128. }
  129. else if (u < 0x0800) {
  130. if (out8 + 1 >= out8end)
  131. break;
  132. *out8++ = (0x3 << 6) | (0x1F & (u >> 6));
  133. *out8 = (0x1 << 7) | (0x3F & (u));
  134. }
  135. else if (u < 0xD800 || u >= 0xE000) {
  136. if (out8 + 2 >= out8end)
  137. break;
  138. *out8++ = (0x7 << 5) | (0xF & (u >> 12));
  139. *out8++ = (0x1 << 7) | (0x3F & (u >> 6));
  140. *out8 = (0x1 << 7) | (0x3F & (u));
  141. }
  142. else if (u < 0xDC00) {
  143. wchar_t u2 = *++in16;
  144. if (!u2)
  145. break;
  146. if (u2 >= 0xDC00 && u2 < 0xE000) {
  147. if (out8 + 3 >= out8end)
  148. break;
  149. else {
  150. unsigned int uc = 0x10000 + (u2 - 0xDC00) + ((u - 0xD800) << 10);
  151. *out8++ = (0xF << 4) | (0x7 & (uc >> 18));
  152. *out8++ = (0x1 << 7) | (0x3F & (uc >> 12));
  153. *out8++ = (0x1 << 7) | (0x3F & (uc >> 6));
  154. *out8 = (0x1 << 7) | (0x3F & (uc));
  155. }
  156. }
  157. else {
  158. out8--;
  159. err |= UTF_ERROR_ILLCHAR;
  160. }
  161. }
  162. else if (u < 0xE000) {
  163. out8--;
  164. err |= UTF_ERROR_ILLCHAR;
  165. }
  166. }
  167. *out8 = *out8end = 0;
  168. if (*in16)
  169. err |= UTF_ERROR_SMALL;
  170. return err;
  171. }
  172. int conv_utf_8_to_16(const char *in8, wchar_t *out16, size_t size16)
  173. {
  174. char u;
  175. char type = 0;
  176. unsigned int u32 = 0;
  177. wchar_t *out16end = out16 + size16;
  178. int err = 0;
  179. if (!size16 || !in8 || !out16)
  180. return UTF_ERROR_NULL_IN;
  181. out16end--;
  182. for (; out16 < out16end && (u = *in8); in8++) {
  183. if (type == 0) {
  184. if ((u & 0x01 << 7) == 0) {
  185. *out16 = u;
  186. out16++;
  187. u32 = 0;
  188. continue;
  189. } // 1 utf-8 char
  190. if ((u & 0x07 << 5) == 0xC0) {
  191. type = 1;
  192. u32 = u & 0x1F;
  193. continue;
  194. } // 2 utf-8 char
  195. if ((u & 0x0F << 4) == 0xE0) {
  196. type = 2;
  197. u32 = u & 0x0F;
  198. continue;
  199. } // 3 utf-8 char
  200. if ((u & 0x1F << 3) == 0xF0) {
  201. type = 3;
  202. u32 = u & 0x07;
  203. continue;
  204. } // 4 utf-8 char
  205. err |= UTF_ERROR_ILLCHAR;
  206. continue;
  207. }
  208. else {
  209. if ((u & 0xC0) == 0x80) {
  210. u32 = (u32 << 6) | (u & 0x3F);
  211. type--;
  212. }
  213. else {
  214. u32 = 0;
  215. type = 0;
  216. err |= UTF_ERROR_ILLSEQ;
  217. }
  218. }
  219. if (type == 0) {
  220. if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) {
  221. *out16 = u32;
  222. out16++;
  223. }
  224. else if (0x10000 <= u32 && u32 < 0x110000) {
  225. if (out16 + 1 >= out16end)
  226. break;
  227. u32 -= 0x10000;
  228. *out16 = 0xD800 + (u32 >> 10);
  229. out16++;
  230. *out16 = 0xDC00 + (u32 & 0x3FF);
  231. out16++;
  232. }
  233. u32 = 0;
  234. }
  235. }
  236. *out16 = *out16end = 0;
  237. if (*in8)
  238. err |= UTF_ERROR_SMALL;
  239. return err;
  240. }
  241. /* UNUSED FUNCTIONS */
  242. #if 0
  243. static int is_ascii(const char *in8)
  244. {
  245. for (; *in8; in8++)
  246. if (0x80 & *in8)
  247. return 0;
  248. return 1;
  249. }
  250. static void utf_8_cut_end(char *inout8, size_t maxcutpoint)
  251. {
  252. char *cur = inout8 + maxcutpoint;
  253. char cc;
  254. if (!inout8)
  255. return;
  256. cc = *cur;
  257. }
  258. #endif
  259. char *alloc_utf_8_from_16(const wchar_t *in16, size_t add)
  260. {
  261. size_t bsize = count_utf_8_from_16(in16);
  262. char *out8 = NULL;
  263. if (!bsize)
  264. return NULL;
  265. out8 = (char *)malloc(sizeof(char) * (bsize + add));
  266. conv_utf_16_to_8(in16, out8, bsize);
  267. return out8;
  268. }
  269. wchar_t *alloc_utf16_from_8(const char *in8, size_t add)
  270. {
  271. size_t bsize = count_utf_16_from_8(in8);
  272. wchar_t *out16 = NULL;
  273. if (!bsize)
  274. return NULL;
  275. out16 = (wchar_t *)malloc(sizeof(wchar_t) * (bsize + add));
  276. conv_utf_8_to_16(in8, out16, bsize);
  277. return out16;
  278. }