123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539 |
- /*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2003, 2005 Ryuichiro Imura
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
- #include <sys/cdefs.h>
- __FBSDID("$FreeBSD$");
- #include <sys/param.h>
- #include <sys/kernel.h>
- #include <sys/systm.h>
- #include <sys/malloc.h>
- #include <sys/iconv.h>
- #include "iconv_converter_if.h"
- /*
- * "UCS" converter
- */
- #define KICONV_UCS_COMBINE 0x1
- #define KICONV_UCS_FROM_UTF8 0x2
- #define KICONV_UCS_TO_UTF8 0x4
- #define KICONV_UCS_FROM_LE 0x8
- #define KICONV_UCS_TO_LE 0x10
- #define KICONV_UCS_FROM_UTF16 0x20
- #define KICONV_UCS_TO_UTF16 0x40
- #define KICONV_UCS_UCS4 0x80
- #define ENCODING_UTF16 "UTF-16BE"
- #define ENCODING_UTF8 "UTF-8"
- static struct {
- const char *name;
- int from_flag, to_flag;
- } unicode_family[] = {
- { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
- { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
- { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
- { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
- KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
- { NULL, 0, 0 }
- };
- static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
- static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
- static uint32_t encode_surrogate(uint32_t code);
- static uint32_t decode_surrogate(const u_char *ucs);
- #ifdef MODULE_DEPEND
- MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
- #endif
- /*
- * UCS converter instance
- */
- struct iconv_ucs {
- KOBJ_FIELDS;
- int convtype;
- struct iconv_cspair * d_csp;
- struct iconv_cspair * d_cspf;
- void * f_ctp;
- void * t_ctp;
- void * ctype;
- };
- static int
- iconv_ucs_open(struct iconv_converter_class *dcp,
- struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
- {
- struct iconv_ucs *dp;
- int i;
- const char *from, *to;
- dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
- to = csp->cp_to;
- from = cspf ? cspf->cp_from : csp->cp_from;
- dp->convtype = 0;
- if (cspf)
- dp->convtype |= KICONV_UCS_COMBINE;
- for (i = 0; unicode_family[i].name; i++) {
- if (strcasecmp(from, unicode_family[i].name) == 0)
- dp->convtype |= unicode_family[i].from_flag;
- if (strcasecmp(to, unicode_family[i].name) == 0)
- dp->convtype |= unicode_family[i].to_flag;
- }
- if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
- dp->convtype |= KICONV_UCS_UCS4;
- else
- dp->convtype &= ~KICONV_UCS_UCS4;
- dp->f_ctp = dp->t_ctp = NULL;
- if (dp->convtype & KICONV_UCS_COMBINE) {
- if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
- (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
- iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
- }
- if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
- (dp->convtype & KICONV_UCS_TO_LE) == 0) {
- iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
- }
- }
- dp->ctype = NULL;
- if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
- iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
- dp->d_csp = csp;
- if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
- if (cspf) {
- dp->d_cspf = cspf;
- cspf->cp_refcount++;
- } else
- csp->cp_refcount++;
- }
- if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
- csp->cp_refcount++;
- *dpp = (void*)dp;
- return 0;
- }
- static int
- iconv_ucs_close(void *data)
- {
- struct iconv_ucs *dp = data;
- if (dp->f_ctp)
- iconv_close(dp->f_ctp);
- if (dp->t_ctp)
- iconv_close(dp->t_ctp);
- if (dp->ctype)
- iconv_close(dp->ctype);
- if (dp->d_cspf)
- dp->d_cspf->cp_refcount--;
- else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
- dp->d_csp->cp_refcount--;
- if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
- dp->d_csp->cp_refcount--;
- kobj_delete((struct kobj*)data, M_ICONV);
- return 0;
- }
- static int
- iconv_ucs_conv(void *d2p, const char **inbuf,
- size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
- int convchar, int casetype)
- {
- struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
- int ret = 0, i;
- size_t in, on, ir, or, inlen, outlen, ucslen;
- const char *src, *p;
- char *dst;
- u_char ucs[4], *q;
- uint32_t code;
- if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
- return 0;
- ir = in = *inbytesleft;
- or = on = *outbytesleft;
- src = *inbuf;
- dst = *outbuf;
- while (ir > 0 && or > 0) {
- /*
- * The first half of conversion.
- * (convert any code into ENCODING_UNICODE)
- */
- code = 0;
- p = src;
- if (dp->convtype & KICONV_UCS_FROM_UTF8) {
- /* convert UTF-8 to ENCODING_UNICODE */
- inlen = 0;
- code = utf8_to_ucs4(p, &inlen, ir);
- if (code == 0) {
- ret = -1;
- break;
- }
- if (casetype == KICONV_FROM_LOWER && dp->ctype) {
- code = towlower(code, dp->ctype);
- } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
- code = towupper(code, dp->ctype);
- }
- if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
- /* reserved for utf-16 surrogate pair */
- /* invalid unicode */
- ret = -1;
- break;
- }
- if (inlen == 4) {
- if (dp->convtype & KICONV_UCS_UCS4) {
- ucslen = 4;
- code = encode_surrogate(code);
- } else {
- /* can't handle with ucs-2 */
- ret = -1;
- break;
- }
- } else {
- ucslen = 2;
- }
- /* save UCS-4 into ucs[] */
- for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
- *q++ = (code >> (i << 3)) & 0xff;
- } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
- /* convert local code to ENCODING_UNICODE */
- ucslen = 4;
- inlen = ir;
- q = ucs;
- ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
- &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
- if (ret)
- break;
- inlen = ir - inlen;
- ucslen = 4 - ucslen;
- } else {
- /* src code is a proper subset of ENCODING_UNICODE */
- q = ucs;
- if (dp->convtype & KICONV_UCS_FROM_LE) {
- *q = *(p + 1);
- *(q + 1) = *p;
- p += 2;
- } else {
- *q = *p++;
- *(q + 1) = *p++;
- }
- if ((*q & 0xfc) == 0xd8) {
- if (dp->convtype & KICONV_UCS_UCS4 &&
- dp->convtype & KICONV_UCS_FROM_UTF16) {
- inlen = ucslen = 4;
- } else {
- /* invalid unicode */
- ret = -1;
- break;
- }
- } else {
- inlen = ucslen = 2;
- }
- if (ir < inlen) {
- ret = -1;
- break;
- }
- if (ucslen == 4) {
- q += 2;
- if (dp->convtype & KICONV_UCS_FROM_LE) {
- *q = *(p + 1);
- *(q + 1) = *p;
- } else {
- *q = *p++;
- *(q + 1) = *p;
- }
- if ((*q & 0xfc) != 0xdc) {
- /* invalid unicode */
- ret = -1;
- break;
- }
- }
- }
- /*
- * The second half of conversion.
- * (convert ENCODING_UNICODE into any code)
- */
- p = ucs;
- if (dp->convtype & KICONV_UCS_TO_UTF8) {
- q = (u_char *)dst;
- if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
- /* decode surrogate pair */
- code = decode_surrogate(p);
- } else {
- code = (ucs[0] << 8) | ucs[1];
- }
- if (casetype == KICONV_LOWER && dp->ctype) {
- code = towlower(code, dp->ctype);
- } else if (casetype == KICONV_UPPER && dp->ctype) {
- code = towupper(code, dp->ctype);
- }
- outlen = 0;
- if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
- ret = -1;
- break;
- }
- src += inlen;
- ir -= inlen;
- dst += outlen;
- or -= outlen;
- } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
- ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
- &or, casetype & (KICONV_LOWER | KICONV_UPPER));
- if (ret)
- break;
- src += inlen;
- ir -= inlen;
- } else {
- /* dst code is a proper subset of ENCODING_UNICODE */
- if (or < ucslen) {
- ret = -1;
- break;
- }
- src += inlen;
- ir -= inlen;
- or -= ucslen;
- if (dp->convtype & KICONV_UCS_TO_LE) {
- *dst++ = *(p + 1);
- *dst++ = *p;
- p += 2;
- } else {
- *dst++ = *p++;
- *dst++ = *p++;
- }
- if (ucslen == 4) {
- if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
- (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
- ret = -1;
- break;
- }
- if (dp->convtype & KICONV_UCS_TO_LE) {
- *dst++ = *(p + 1);
- *dst++ = *p;
- } else {
- *dst++ = *p++;
- *dst++ = *p;
- }
- }
- }
- if (convchar == 1)
- break;
- }
- *inbuf += in - ir;
- *outbuf += on - or;
- *inbytesleft -= in - ir;
- *outbytesleft -= on - or;
- return (ret);
- }
- static int
- iconv_ucs_init(struct iconv_converter_class *dcp)
- {
- int error;
- error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
- if (error)
- return (error);
- error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
- if (error)
- return (error);
- return (0);
- }
- static int
- iconv_ucs_done(struct iconv_converter_class *dcp)
- {
- return (0);
- }
- static const char *
- iconv_ucs_name(struct iconv_converter_class *dcp)
- {
- return (ENCODING_UNICODE);
- }
- static kobj_method_t iconv_ucs_methods[] = {
- KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
- KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
- KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
- KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
- KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
- KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
- {0, 0}
- };
- KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
- static uint32_t
- utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
- {
- size_t i, w = 0;
- uint32_t ucs4 = 0;
- /*
- * get leading 1 byte from utf-8
- */
- if ((*src & 0x80) == 0) {
- /*
- * leading 1 bit is "0"
- * utf-8: 0xxxxxxx
- * ucs-4: 00000000 00000000 00000000 0xxxxxxx
- */
- w = 1;
- /* get trailing 7 bits */
- ucs4 = *src & 0x7f;
- } else if ((*src & 0xe0) == 0xc0) {
- /*
- * leading 3 bits are "110"
- * utf-8: 110xxxxx 10yyyyyy
- * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
- */
- w = 2;
- /* get trailing 5 bits */
- ucs4 = *src & 0x1f;
- } else if ((*src & 0xf0) == 0xe0) {
- /*
- * leading 4 bits are "1110"
- * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
- * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
- */
- w = 3;
- /* get trailing 4 bits */
- ucs4 = *src & 0x0f;
- } else if ((*src & 0xf8) == 0xf0) {
- /*
- * leading 5 bits are "11110"
- * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
- * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
- */
- w = 4;
- /* get trailing 3 bits */
- ucs4 = *src & 0x07;
- } else {
- /* out of utf-16 range or having illegal bits */
- return (0);
- }
- if (srclen < w)
- return (0);
- /*
- * get left parts from utf-8
- */
- for (i = 1 ; i < w ; i++) {
- if ((*(src + i) & 0xc0) != 0x80) {
- /* invalid: leading 2 bits are not "10" */
- return (0);
- }
- /* concatenate trailing 6 bits into ucs4 */
- ucs4 <<= 6;
- ucs4 |= *(src + i) & 0x3f;
- }
- *utf8width = w;
- return (ucs4);
- }
- static u_char *
- ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
- {
- u_char lead, *p;
- size_t i, w;
- /*
- * determine utf-8 width and leading bits
- */
- if (ucs4 < 0x80) {
- w = 1;
- lead = 0; /* "0" */
- } else if (ucs4 < 0x800) {
- w = 2;
- lead = 0xc0; /* "11" */
- } else if (ucs4 < 0x10000) {
- w = 3;
- lead = 0xe0; /* "111" */
- } else if (ucs4 < 0x200000) {
- w = 4;
- lead = 0xf0; /* "1111" */
- } else {
- return (NULL);
- }
- if (dstlen < w)
- return (NULL);
- /*
- * construct utf-8
- */
- p = dst;
- for (i = w - 1 ; i >= 1 ; i--) {
- /* get trailing 6 bits and put it with leading bit as "1" */
- *(p + i) = (ucs4 & 0x3f) | 0x80;
- ucs4 >>= 6;
- }
- *p = ucs4 | lead;
- *utf8width = w;
- return (p);
- }
- static uint32_t
- encode_surrogate(uint32_t code)
- {
- return ((((code - 0x10000) << 6) & 0x3ff0000) |
- ((code - 0x10000) & 0x3ff) | 0xd800dc00);
- }
- static uint32_t
- decode_surrogate(const u_char *ucs)
- {
- return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) |
- ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000);
- }
|