iconv.c 11 KB


  1. /* Character set conversion.
  2. Copyright (C) 1999-2001, 2007, 2009-2023 Free Software Foundation, Inc.
  3. This file is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Lesser General Public License as
  5. published by the Free Software Foundation; either version 2.1 of the
  6. License, or (at your option) any later version.
  7. This file is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public License
  12. along with this program. If not, see <https://www.gnu.org/licenses/>. */
  13. #include <config.h>
  14. /* Specification. */
  15. #include <iconv.h>
  16. #include <stddef.h>
  17. #if REPLACE_ICONV_UTF
  18. # include <errno.h>
  19. # include <stdint.h>
  20. # include <stdlib.h>
  21. # include "unistr.h"
  22. #endif
  23. #if REPLACE_ICONV_UTF
  24. /* UTF-{16,32}{BE,LE} converters taken from GNU libiconv 1.11. */
  /* Result codes of the single-character converters below.  The expansions
     are parenthesized so that each macro behaves as one operand wherever it
     is used in an expression.  */
  /* xxx_mbtowc: the input bytes do not encode a valid character.  */
  # define RET_ILSEQ (-1)
  /* xxx_mbtowc: the input is too short to hold a complete character.  */
  # define RET_TOOFEW (-2)
  /* xxx_wctomb: the character cannot be represented in the target encoding.  */
  # define RET_ILUNI (-1)
  /* xxx_wctomb: the output buffer is too small for the encoded character.  */
  # define RET_TOOSMALL (-2)
  33. /*
  34. * UTF-16BE
  35. */
  36. /* Specification: RFC 2781 */
  37. static int
  38. utf16be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
  39. {
  40. if (n >= 2)
  41. {
  42. ucs4_t wc = (s[0] << 8) + s[1];
  43. if (wc >= 0xd800 && wc < 0xdc00)
  44. {
  45. if (n >= 4)
  46. {
  47. ucs4_t wc2 = (s[2] << 8) + s[3];
  48. if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
  49. return RET_ILSEQ;
  50. *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
  51. return 4;
  52. }
  53. }
  54. else if (wc >= 0xdc00 && wc < 0xe000)
  55. {
  56. return RET_ILSEQ;
  57. }
  58. else
  59. {
  60. *pwc = wc;
  61. return 2;
  62. }
  63. }
  64. return RET_TOOFEW;
  65. }
  66. static int
  67. utf16be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
  68. {
  69. if (!(wc >= 0xd800 && wc < 0xe000))
  70. {
  71. if (wc < 0x10000)
  72. {
  73. if (n >= 2)
  74. {
  75. r[0] = (unsigned char) (wc >> 8);
  76. r[1] = (unsigned char) wc;
  77. return 2;
  78. }
  79. else
  80. return RET_TOOSMALL;
  81. }
  82. else if (wc < 0x110000)
  83. {
  84. if (n >= 4)
  85. {
  86. ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
  87. ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
  88. r[0] = (unsigned char) (wc1 >> 8);
  89. r[1] = (unsigned char) wc1;
  90. r[2] = (unsigned char) (wc2 >> 8);
  91. r[3] = (unsigned char) wc2;
  92. return 4;
  93. }
  94. else
  95. return RET_TOOSMALL;
  96. }
  97. }
  98. return RET_ILUNI;
  99. }
  100. /*
  101. * UTF-16LE
  102. */
  103. /* Specification: RFC 2781 */
  104. static int
  105. utf16le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
  106. {
  107. if (n >= 2)
  108. {
  109. ucs4_t wc = s[0] + (s[1] << 8);
  110. if (wc >= 0xd800 && wc < 0xdc00)
  111. {
  112. if (n >= 4)
  113. {
  114. ucs4_t wc2 = s[2] + (s[3] << 8);
  115. if (!(wc2 >= 0xdc00 && wc2 < 0xe000))
  116. return RET_ILSEQ;
  117. *pwc = 0x10000 + ((wc - 0xd800) << 10) + (wc2 - 0xdc00);
  118. return 4;
  119. }
  120. }
  121. else if (wc >= 0xdc00 && wc < 0xe000)
  122. {
  123. return RET_ILSEQ;
  124. }
  125. else
  126. {
  127. *pwc = wc;
  128. return 2;
  129. }
  130. }
  131. return RET_TOOFEW;
  132. }
  133. static int
  134. utf16le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
  135. {
  136. if (!(wc >= 0xd800 && wc < 0xe000))
  137. {
  138. if (wc < 0x10000)
  139. {
  140. if (n >= 2)
  141. {
  142. r[0] = (unsigned char) wc;
  143. r[1] = (unsigned char) (wc >> 8);
  144. return 2;
  145. }
  146. else
  147. return RET_TOOSMALL;
  148. }
  149. else if (wc < 0x110000)
  150. {
  151. if (n >= 4)
  152. {
  153. ucs4_t wc1 = 0xd800 + ((wc - 0x10000) >> 10);
  154. ucs4_t wc2 = 0xdc00 + ((wc - 0x10000) & 0x3ff);
  155. r[0] = (unsigned char) wc1;
  156. r[1] = (unsigned char) (wc1 >> 8);
  157. r[2] = (unsigned char) wc2;
  158. r[3] = (unsigned char) (wc2 >> 8);
  159. return 4;
  160. }
  161. else
  162. return RET_TOOSMALL;
  163. }
  164. }
  165. return RET_ILUNI;
  166. }
  167. /*
  168. * UTF-32BE
  169. */
  170. /* Specification: Unicode 3.1 Standard Annex #19 */
  171. static int
  172. utf32be_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
  173. {
  174. if (n >= 4)
  175. {
  176. ucs4_t wc = (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + s[3];
  177. if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
  178. {
  179. *pwc = wc;
  180. return 4;
  181. }
  182. else
  183. return RET_ILSEQ;
  184. }
  185. return RET_TOOFEW;
  186. }
  187. static int
  188. utf32be_wctomb (unsigned char *r, ucs4_t wc, size_t n)
  189. {
  190. if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
  191. {
  192. if (n >= 4)
  193. {
  194. r[0] = 0;
  195. r[1] = (unsigned char) (wc >> 16);
  196. r[2] = (unsigned char) (wc >> 8);
  197. r[3] = (unsigned char) wc;
  198. return 4;
  199. }
  200. else
  201. return RET_TOOSMALL;
  202. }
  203. return RET_ILUNI;
  204. }
  205. /*
  206. * UTF-32LE
  207. */
  208. /* Specification: Unicode 3.1 Standard Annex #19 */
  209. static int
  210. utf32le_mbtowc (ucs4_t *pwc, const unsigned char *s, size_t n)
  211. {
  212. if (n >= 4)
  213. {
  214. ucs4_t wc = s[0] + (s[1] << 8) + (s[2] << 16) + (s[3] << 24);
  215. if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
  216. {
  217. *pwc = wc;
  218. return 4;
  219. }
  220. else
  221. return RET_ILSEQ;
  222. }
  223. return RET_TOOFEW;
  224. }
  225. static int
  226. utf32le_wctomb (unsigned char *r, ucs4_t wc, size_t n)
  227. {
  228. if (wc < 0x110000 && !(wc >= 0xd800 && wc < 0xe000))
  229. {
  230. if (n >= 4)
  231. {
  232. r[0] = (unsigned char) wc;
  233. r[1] = (unsigned char) (wc >> 8);
  234. r[2] = (unsigned char) (wc >> 16);
  235. r[3] = 0;
  236. return 4;
  237. }
  238. else
  239. return RET_TOOSMALL;
  240. }
  241. return RET_ILUNI;
  242. }
  243. #endif
  244. size_t
  245. rpl_iconv (iconv_t cd,
  246. ICONV_CONST char **inbuf, size_t *inbytesleft,
  247. char **outbuf, size_t *outbytesleft)
  248. #undef iconv
  249. {
  250. #if REPLACE_ICONV_UTF
  251. switch ((uintptr_t) cd)
  252. {
  253. {
  254. int (*xxx_wctomb) (unsigned char *, ucs4_t, size_t);
  255. case (uintptr_t) _ICONV_UTF8_UTF16BE:
  256. xxx_wctomb = utf16be_wctomb;
  257. goto loop_from_utf8;
  258. case (uintptr_t) _ICONV_UTF8_UTF16LE:
  259. xxx_wctomb = utf16le_wctomb;
  260. goto loop_from_utf8;
  261. case (uintptr_t) _ICONV_UTF8_UTF32BE:
  262. xxx_wctomb = utf32be_wctomb;
  263. goto loop_from_utf8;
  264. case (uintptr_t) _ICONV_UTF8_UTF32LE:
  265. xxx_wctomb = utf32le_wctomb;
  266. goto loop_from_utf8;
  267. loop_from_utf8:
  268. if (inbuf == NULL || *inbuf == NULL)
  269. return 0;
  270. {
  271. ICONV_CONST char *inptr = *inbuf;
  272. size_t inleft = *inbytesleft;
  273. char *outptr = *outbuf;
  274. size_t outleft = *outbytesleft;
  275. size_t res = 0;
  276. while (inleft > 0)
  277. {
  278. ucs4_t uc;
  279. int m = u8_mbtoucr (&uc, (const uint8_t *) inptr, inleft);
  280. if (m <= 0)
  281. {
  282. if (m == -1)
  283. {
  284. errno = EILSEQ;
  285. res = (size_t)(-1);
  286. break;
  287. }
  288. if (m == -2)
  289. {
  290. errno = EINVAL;
  291. res = (size_t)(-1);
  292. break;
  293. }
  294. abort ();
  295. }
  296. else
  297. {
  298. int n = xxx_wctomb ((uint8_t *) outptr, uc, outleft);
  299. if (n < 0)
  300. {
  301. if (n == RET_ILUNI)
  302. {
  303. errno = EILSEQ;
  304. res = (size_t)(-1);
  305. break;
  306. }
  307. if (n == RET_TOOSMALL)
  308. {
  309. errno = E2BIG;
  310. res = (size_t)(-1);
  311. break;
  312. }
  313. abort ();
  314. }
  315. else
  316. {
  317. inptr += m;
  318. inleft -= m;
  319. outptr += n;
  320. outleft -= n;
  321. }
  322. }
  323. }
  324. *inbuf = inptr;
  325. *inbytesleft = inleft;
  326. *outbuf = outptr;
  327. *outbytesleft = outleft;
  328. return res;
  329. }
  330. }
  331. {
  332. int (*xxx_mbtowc) (ucs4_t *, const unsigned char *, size_t);
  333. case (uintptr_t) _ICONV_UTF16BE_UTF8:
  334. xxx_mbtowc = utf16be_mbtowc;
  335. goto loop_to_utf8;
  336. case (uintptr_t) _ICONV_UTF16LE_UTF8:
  337. xxx_mbtowc = utf16le_mbtowc;
  338. goto loop_to_utf8;
  339. case (uintptr_t) _ICONV_UTF32BE_UTF8:
  340. xxx_mbtowc = utf32be_mbtowc;
  341. goto loop_to_utf8;
  342. case (uintptr_t) _ICONV_UTF32LE_UTF8:
  343. xxx_mbtowc = utf32le_mbtowc;
  344. goto loop_to_utf8;
  345. loop_to_utf8:
  346. if (inbuf == NULL || *inbuf == NULL)
  347. return 0;
  348. {
  349. ICONV_CONST char *inptr = *inbuf;
  350. size_t inleft = *inbytesleft;
  351. char *outptr = *outbuf;
  352. size_t outleft = *outbytesleft;
  353. size_t res = 0;
  354. while (inleft > 0)
  355. {
  356. ucs4_t uc;
  357. int m = xxx_mbtowc (&uc, (const uint8_t *) inptr, inleft);
  358. if (m <= 0)
  359. {
  360. if (m == RET_ILSEQ)
  361. {
  362. errno = EILSEQ;
  363. res = (size_t)(-1);
  364. break;
  365. }
  366. if (m == RET_TOOFEW)
  367. {
  368. errno = EINVAL;
  369. res = (size_t)(-1);
  370. break;
  371. }
  372. abort ();
  373. }
  374. else
  375. {
  376. int n = u8_uctomb ((uint8_t *) outptr, uc, outleft);
  377. if (n < 0)
  378. {
  379. if (n == -1)
  380. {
  381. errno = EILSEQ;
  382. res = (size_t)(-1);
  383. break;
  384. }
  385. if (n == -2)
  386. {
  387. errno = E2BIG;
  388. res = (size_t)(-1);
  389. break;
  390. }
  391. abort ();
  392. }
  393. else
  394. {
  395. inptr += m;
  396. inleft -= m;
  397. outptr += n;
  398. outleft -= n;
  399. }
  400. }
  401. }
  402. *inbuf = inptr;
  403. *inbytesleft = inleft;
  404. *outbuf = outptr;
  405. *outbytesleft = outleft;
  406. return res;
  407. }
  408. }
  409. }
  410. #endif
  411. return iconv (cd, inbuf, inbytesleft, outbuf, outbytesleft);
  412. }