iconv_ucs.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  3. *
  4. * Copyright (c) 2003, 2005 Ryuichiro Imura
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26. * SUCH DAMAGE.
  27. */
  28. #include <sys/cdefs.h>
  29. __FBSDID("$FreeBSD$");
  30. #include <sys/param.h>
  31. #include <sys/kernel.h>
  32. #include <sys/systm.h>
  33. #include <sys/malloc.h>
  34. #include <sys/iconv.h>
  35. #include "iconv_converter_if.h"
  36. /*
  37. * "UCS" converter
  38. */
  /*
   * Bits stored in iconv_ucs.convtype.
   */
  #define KICONV_UCS_COMBINE      0x1     /* a second cspair (cspf) was supplied */
  #define KICONV_UCS_FROM_UTF8    0x2     /* source encoding is UTF-8 */
  #define KICONV_UCS_TO_UTF8      0x4     /* destination encoding is UTF-8 */
  #define KICONV_UCS_FROM_LE      0x8     /* source 16-bit units are little-endian */
  #define KICONV_UCS_TO_LE        0x10    /* destination 16-bit units are little-endian */
  #define KICONV_UCS_FROM_UTF16   0x20    /* source is UTF-16 (surrogates allowed) */
  #define KICONV_UCS_TO_UTF16     0x40    /* destination is UTF-16 (surrogates allowed) */
  #define KICONV_UCS_UCS4         0x80    /* intermediate form may hold surrogate pairs */

  #define ENCODING_UTF16          "UTF-16BE"
  #define ENCODING_UTF8           "UTF-8"

  /*
   * Encodings handled directly by this converter, with the convtype bits
   * to set when the encoding appears as the source (from_flag) or as the
   * destination (to_flag).  The list ends with a NULL name.
   */
  static struct {
      const char *name;
      int from_flag, to_flag;
  } unicode_family[] = {
      {
          .name = "UTF-8",
          .from_flag = KICONV_UCS_FROM_UTF8,
          .to_flag = KICONV_UCS_TO_UTF8
      },
      {
          .name = "UCS-2LE",
          .from_flag = KICONV_UCS_FROM_LE,
          .to_flag = KICONV_UCS_TO_LE
      },
      {
          .name = "UTF-16BE",
          .from_flag = KICONV_UCS_FROM_UTF16,
          .to_flag = KICONV_UCS_TO_UTF16
      },
      {
          .name = "UTF-16LE",
          .from_flag = KICONV_UCS_FROM_UTF16 | KICONV_UCS_FROM_LE,
          .to_flag = KICONV_UCS_TO_UTF16 | KICONV_UCS_TO_LE
      },
      { .name = NULL, .from_flag = 0, .to_flag = 0 }
  };
  /*
   * File-local helpers defined after the converter methods.
   */
  static uint32_t  utf8_to_ucs4(const char *src, size_t *utf8width,
                       size_t srclen);
  static u_char   *ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
                       size_t dstlen);
  static uint32_t  encode_surrogate(uint32_t code);
  static uint32_t  decode_surrogate(const u_char *ucs);

  #ifdef MODULE_DEPEND
  MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
  #endif
  67. /*
  68. * UCS converter instance
  69. */
  70. struct iconv_ucs {
  71. KOBJ_FIELDS;
  72. int convtype;
  73. struct iconv_cspair * d_csp;
  74. struct iconv_cspair * d_cspf;
  75. void * f_ctp;
  76. void * t_ctp;
  77. void * ctype;
  78. };
  79. static int
  80. iconv_ucs_open(struct iconv_converter_class *dcp,
  81. struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
  82. {
  83. struct iconv_ucs *dp;
  84. int i;
  85. const char *from, *to;
  86. dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
  87. to = csp->cp_to;
  88. from = cspf ? cspf->cp_from : csp->cp_from;
  89. dp->convtype = 0;
  90. if (cspf)
  91. dp->convtype |= KICONV_UCS_COMBINE;
  92. for (i = 0; unicode_family[i].name; i++) {
  93. if (strcasecmp(from, unicode_family[i].name) == 0)
  94. dp->convtype |= unicode_family[i].from_flag;
  95. if (strcasecmp(to, unicode_family[i].name) == 0)
  96. dp->convtype |= unicode_family[i].to_flag;
  97. }
  98. if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
  99. dp->convtype |= KICONV_UCS_UCS4;
  100. else
  101. dp->convtype &= ~KICONV_UCS_UCS4;
  102. dp->f_ctp = dp->t_ctp = NULL;
  103. if (dp->convtype & KICONV_UCS_COMBINE) {
  104. if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
  105. (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
  106. iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
  107. }
  108. if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
  109. (dp->convtype & KICONV_UCS_TO_LE) == 0) {
  110. iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
  111. }
  112. }
  113. dp->ctype = NULL;
  114. if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
  115. iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
  116. dp->d_csp = csp;
  117. if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
  118. if (cspf) {
  119. dp->d_cspf = cspf;
  120. cspf->cp_refcount++;
  121. } else
  122. csp->cp_refcount++;
  123. }
  124. if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  125. csp->cp_refcount++;
  126. *dpp = (void*)dp;
  127. return 0;
  128. }
  129. static int
  130. iconv_ucs_close(void *data)
  131. {
  132. struct iconv_ucs *dp = data;
  133. if (dp->f_ctp)
  134. iconv_close(dp->f_ctp);
  135. if (dp->t_ctp)
  136. iconv_close(dp->t_ctp);
  137. if (dp->ctype)
  138. iconv_close(dp->ctype);
  139. if (dp->d_cspf)
  140. dp->d_cspf->cp_refcount--;
  141. else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
  142. dp->d_csp->cp_refcount--;
  143. if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
  144. dp->d_csp->cp_refcount--;
  145. kobj_delete((struct kobj*)data, M_ICONV);
  146. return 0;
  147. }
  148. static int
  149. iconv_ucs_conv(void *d2p, const char **inbuf,
  150. size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
  151. int convchar, int casetype)
  152. {
  153. struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
  154. int ret = 0, i;
  155. size_t in, on, ir, or, inlen, outlen, ucslen;
  156. const char *src, *p;
  157. char *dst;
  158. u_char ucs[4], *q;
  159. uint32_t code;
  160. if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
  161. return 0;
  162. ir = in = *inbytesleft;
  163. or = on = *outbytesleft;
  164. src = *inbuf;
  165. dst = *outbuf;
  166. while (ir > 0 && or > 0) {
  167. /*
  168. * The first half of conversion.
  169. * (convert any code into ENCODING_UNICODE)
  170. */
  171. code = 0;
  172. p = src;
  173. if (dp->convtype & KICONV_UCS_FROM_UTF8) {
  174. /* convert UTF-8 to ENCODING_UNICODE */
  175. inlen = 0;
  176. code = utf8_to_ucs4(p, &inlen, ir);
  177. if (code == 0) {
  178. ret = -1;
  179. break;
  180. }
  181. if (casetype == KICONV_FROM_LOWER && dp->ctype) {
  182. code = towlower(code, dp->ctype);
  183. } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
  184. code = towupper(code, dp->ctype);
  185. }
  186. if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
  187. /* reserved for utf-16 surrogate pair */
  188. /* invalid unicode */
  189. ret = -1;
  190. break;
  191. }
  192. if (inlen == 4) {
  193. if (dp->convtype & KICONV_UCS_UCS4) {
  194. ucslen = 4;
  195. code = encode_surrogate(code);
  196. } else {
  197. /* can't handle with ucs-2 */
  198. ret = -1;
  199. break;
  200. }
  201. } else {
  202. ucslen = 2;
  203. }
  204. /* save UCS-4 into ucs[] */
  205. for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
  206. *q++ = (code >> (i << 3)) & 0xff;
  207. } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
  208. /* convert local code to ENCODING_UNICODE */
  209. ucslen = 4;
  210. inlen = ir;
  211. q = ucs;
  212. ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
  213. &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
  214. if (ret)
  215. break;
  216. inlen = ir - inlen;
  217. ucslen = 4 - ucslen;
  218. } else {
  219. /* src code is a proper subset of ENCODING_UNICODE */
  220. q = ucs;
  221. if (dp->convtype & KICONV_UCS_FROM_LE) {
  222. *q = *(p + 1);
  223. *(q + 1) = *p;
  224. p += 2;
  225. } else {
  226. *q = *p++;
  227. *(q + 1) = *p++;
  228. }
  229. if ((*q & 0xfc) == 0xd8) {
  230. if (dp->convtype & KICONV_UCS_UCS4 &&
  231. dp->convtype & KICONV_UCS_FROM_UTF16) {
  232. inlen = ucslen = 4;
  233. } else {
  234. /* invalid unicode */
  235. ret = -1;
  236. break;
  237. }
  238. } else {
  239. inlen = ucslen = 2;
  240. }
  241. if (ir < inlen) {
  242. ret = -1;
  243. break;
  244. }
  245. if (ucslen == 4) {
  246. q += 2;
  247. if (dp->convtype & KICONV_UCS_FROM_LE) {
  248. *q = *(p + 1);
  249. *(q + 1) = *p;
  250. } else {
  251. *q = *p++;
  252. *(q + 1) = *p;
  253. }
  254. if ((*q & 0xfc) != 0xdc) {
  255. /* invalid unicode */
  256. ret = -1;
  257. break;
  258. }
  259. }
  260. }
  261. /*
  262. * The second half of conversion.
  263. * (convert ENCODING_UNICODE into any code)
  264. */
  265. p = ucs;
  266. if (dp->convtype & KICONV_UCS_TO_UTF8) {
  267. q = (u_char *)dst;
  268. if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
  269. /* decode surrogate pair */
  270. code = decode_surrogate(p);
  271. } else {
  272. code = (ucs[0] << 8) | ucs[1];
  273. }
  274. if (casetype == KICONV_LOWER && dp->ctype) {
  275. code = towlower(code, dp->ctype);
  276. } else if (casetype == KICONV_UPPER && dp->ctype) {
  277. code = towupper(code, dp->ctype);
  278. }
  279. outlen = 0;
  280. if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
  281. ret = -1;
  282. break;
  283. }
  284. src += inlen;
  285. ir -= inlen;
  286. dst += outlen;
  287. or -= outlen;
  288. } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
  289. ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
  290. &or, casetype & (KICONV_LOWER | KICONV_UPPER));
  291. if (ret)
  292. break;
  293. src += inlen;
  294. ir -= inlen;
  295. } else {
  296. /* dst code is a proper subset of ENCODING_UNICODE */
  297. if (or < ucslen) {
  298. ret = -1;
  299. break;
  300. }
  301. src += inlen;
  302. ir -= inlen;
  303. or -= ucslen;
  304. if (dp->convtype & KICONV_UCS_TO_LE) {
  305. *dst++ = *(p + 1);
  306. *dst++ = *p;
  307. p += 2;
  308. } else {
  309. *dst++ = *p++;
  310. *dst++ = *p++;
  311. }
  312. if (ucslen == 4) {
  313. if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
  314. (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
  315. ret = -1;
  316. break;
  317. }
  318. if (dp->convtype & KICONV_UCS_TO_LE) {
  319. *dst++ = *(p + 1);
  320. *dst++ = *p;
  321. } else {
  322. *dst++ = *p++;
  323. *dst++ = *p;
  324. }
  325. }
  326. }
  327. if (convchar == 1)
  328. break;
  329. }
  330. *inbuf += in - ir;
  331. *outbuf += on - or;
  332. *inbytesleft -= in - ir;
  333. *outbytesleft -= on - or;
  334. return (ret);
  335. }
  336. static int
  337. iconv_ucs_init(struct iconv_converter_class *dcp)
  338. {
  339. int error;
  340. error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
  341. if (error)
  342. return (error);
  343. error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
  344. if (error)
  345. return (error);
  346. return (0);
  347. }
  /*
   * Converter class teardown hook.  No work is done here; the function
   * ignores its argument and reports success.
   */
  static int
  iconv_ucs_done(struct iconv_converter_class *dcp)
  {
      (void)dcp;

      return (0);
  }
  353. static const char *
  354. iconv_ucs_name(struct iconv_converter_class *dcp)
  355. {
  356. return (ENCODING_UNICODE);
  357. }
  358. static kobj_method_t iconv_ucs_methods[] = {
  359. KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
  360. KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
  361. KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
  362. KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
  363. KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
  364. KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
  365. {0, 0}
  366. };
  367. KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
  /*
   * Decode one UTF-8 sequence of one to four bytes.
   *
   * src       - first byte of the sequence.
   * utf8width - receives the number of bytes consumed; written only on
   *             the non-error return path.
   * srclen    - number of readable bytes at src.
   *
   * Returns the decoded code point, or 0 when the sequence is truncated,
   * has an invalid lead or continuation byte, or is an overlong encoding
   * (a value that fits in a shorter sequence).  A NUL byte also decodes
   * to 0, so callers cannot tell it apart from a failure.  Surrogates and
   * values above 0x10ffff are not filtered here.
   */
  static uint32_t
  utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
  {
      /* smallest code point that genuinely needs a w-byte sequence */
      static const uint32_t min_code[5] = { 0, 0, 0x80, 0x800, 0x10000 };
      unsigned char lead, cont;
      size_t i, w;
      uint32_t ucs4;

      if (srclen == 0)
          return (0);

      /*
       * Classify the lead byte; work on unsigned values so the result
       * does not depend on the signedness of plain char.
       */
      lead = (unsigned char)*src;
      if ((lead & 0x80) == 0) {
          /*
           * utf-8: 0xxxxxxx
           * ucs-4: 00000000 00000000 00000000 0xxxxxxx
           */
          w = 1;
          ucs4 = lead & 0x7f;
      } else if ((lead & 0xe0) == 0xc0) {
          /*
           * utf-8: 110xxxxx 10yyyyyy
           * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
           */
          w = 2;
          ucs4 = lead & 0x1f;
      } else if ((lead & 0xf0) == 0xe0) {
          /*
           * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
           * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
           */
          w = 3;
          ucs4 = lead & 0x0f;
      } else if ((lead & 0xf8) == 0xf0) {
          /*
           * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
           * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
           */
          w = 4;
          ucs4 = lead & 0x07;
      } else {
          /* out of utf-16 range or having illegal bits */
          return (0);
      }

      if (srclen < w)
          return (0);

      /* Fold in the continuation bytes, six payload bits each. */
      for (i = 1; i < w; i++) {
          cont = (unsigned char)src[i];
          if ((cont & 0xc0) != 0x80) {
              /* invalid: leading 2 bits are not "10" */
              return (0);
          }
          ucs4 = (ucs4 << 6) | (cont & 0x3f);
      }

      /* Overlong form: the value would have fit in fewer bytes. */
      if (ucs4 < min_code[w])
          return (0);

      *utf8width = w;
      return (ucs4);
  }
  /*
   * Encode one code point as UTF-8.
   *
   * ucs4      - code point to encode.
   * dst       - output buffer.
   * utf8width - receives the number of bytes written; written only on
   *             success.
   * dstlen    - room available at dst.
   *
   * Returns dst (as u_char *) on success.  Returns NULL, writing nothing,
   * when ucs4 is a UTF-16 surrogate (0xd800-0xdfff), lies above 0x10ffff,
   * or does not fit in dstlen bytes.
   */
  static u_char *
  ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
  {
      u_char lead, *p;
      size_t i, w;

      /* Surrogates are not characters and have no UTF-8 form. */
      if (ucs4 >= 0xd800 && ucs4 < 0xe000)
          return (NULL);

      /*
       * determine utf-8 width and leading bits
       */
      if (ucs4 < 0x80) {
          w = 1;
          lead = 0;       /* "0" */
      } else if (ucs4 < 0x800) {
          w = 2;
          lead = 0xc0;    /* "110" */
      } else if (ucs4 < 0x10000) {
          w = 3;
          lead = 0xe0;    /* "1110" */
      } else if (ucs4 < 0x110000) {
          w = 4;
          lead = 0xf0;    /* "11110" */
      } else {
          /* beyond the last Unicode code point */
          return (NULL);
      }

      if (dstlen < w)
          return (NULL);

      /*
       * construct utf-8, filling continuation bytes from the end
       */
      p = (u_char *)dst;
      for (i = w - 1; i >= 1; i--) {
          /* get trailing 6 bits and put it with leading bits "10" */
          *(p + i) = (u_char)((ucs4 & 0x3f) | 0x80);
          ucs4 >>= 6;
      }
      *p = (u_char)(ucs4 | lead);

      *utf8width = w;
      return (p);
  }
  /*
   * Pack a code point into a UTF-16 surrogate pair held in one 32-bit
   * value: high surrogate in the upper 16 bits, low surrogate in the
   * lower 16 bits.  The caller in this file only passes values of
   * 0x10000 and above.
   */
  static uint32_t
  encode_surrogate(uint32_t code)
  {
      uint32_t offset, high_bits, low_bits;

      offset = code - 0x10000;
      /* top ten bits of the offset, moved up to bits 16..25 */
      high_bits = (offset << 6) & 0x3ff0000;
      /* bottom ten bits of the offset */
      low_bits = offset & 0x3ff;

      return (0xd800dc00 | high_bits | low_bits);
  }
  /*
   * Rebuild a code point from a surrogate pair stored as four bytes:
   * ucs[0..1] is the high surrogate and ucs[2..3] the low surrogate,
   * each most significant byte first.  The bytes are not validated here.
   */
  static uint32_t
  decode_surrogate(const u_char *ucs)
  {
      uint32_t high_bits, low_bits;

      /* ten payload bits from each 16-bit unit */
      high_bits = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
      low_bits = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

      return (((high_bits << 10) | low_bits) + 0x10000);
  }
  482. }