striconveh.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188
  1. /* Character set conversion with error handling.
  2. Copyright (C) 2001-2010 Free Software Foundation, Inc.
  3. Written by Bruno Haible and Simon Josefsson.
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU Lesser General Public License as published by
  6. the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "striconveh.h"
  17. #include <errno.h>
  18. #include <stdbool.h>
  19. #include <stdlib.h>
  20. #include <string.h>
  21. #if HAVE_ICONV
  22. # include <iconv.h>
  23. # include "unistr.h"
  24. #endif
  25. #include "c-strcase.h"
  26. #include "c-strcaseeq.h"
  27. #ifndef SIZE_MAX
  28. # define SIZE_MAX ((size_t) -1)
  29. #endif
  30. #if HAVE_ICONV
  31. /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  32. conversion error occurs, we may have to determine the Unicode representation
  33. of the inconvertible character. */
  34. int
  35. iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  36. {
  37. iconv_t cd;
  38. iconv_t cd1;
  39. iconv_t cd2;
  40. /* Avoid glibc-2.1 bug with EUC-KR. */
  41. # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
  42. if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  43. || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  44. {
  45. errno = EINVAL;
  46. return -1;
  47. }
  48. # endif
  49. cd = iconv_open (to_codeset, from_codeset);
  50. if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  51. cd1 = (iconv_t)(-1);
  52. else
  53. {
  54. cd1 = iconv_open ("UTF-8", from_codeset);
  55. if (cd1 == (iconv_t)(-1))
  56. {
  57. int saved_errno = errno;
  58. if (cd != (iconv_t)(-1))
  59. iconv_close (cdp->cd);
  60. errno = saved_errno;
  61. return -1;
  62. }
  63. }
  64. if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  65. # if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
  66. || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  67. # endif
  68. )
  69. cd2 = (iconv_t)(-1);
  70. else
  71. {
  72. cd2 = iconv_open (to_codeset, "UTF-8");
  73. if (cd2 == (iconv_t)(-1))
  74. {
  75. int saved_errno = errno;
  76. if (cd1 != (iconv_t)(-1))
  77. iconv_close (cd1);
  78. if (cd != (iconv_t)(-1))
  79. iconv_close (cd);
  80. errno = saved_errno;
  81. return -1;
  82. }
  83. }
  84. cdp->cd = cd;
  85. cdp->cd1 = cd1;
  86. cdp->cd2 = cd2;
  87. return 0;
  88. }
  89. int
  90. iconveh_close (const iconveh_t *cd)
  91. {
  92. if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
  93. {
  94. /* Return -1, but preserve the errno from iconv_close. */
  95. int saved_errno = errno;
  96. if (cd->cd1 != (iconv_t)(-1))
  97. iconv_close (cd->cd1);
  98. if (cd->cd != (iconv_t)(-1))
  99. iconv_close (cd->cd);
  100. errno = saved_errno;
  101. return -1;
  102. }
  103. if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
  104. {
  105. /* Return -1, but preserve the errno from iconv_close. */
  106. int saved_errno = errno;
  107. if (cd->cd != (iconv_t)(-1))
  108. iconv_close (cd->cd);
  109. errno = saved_errno;
  110. return -1;
  111. }
  112. if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
  113. return -1;
  114. return 0;
  115. }
  /* iconv_carefully behaves like iconv, with two differences: it stops at the
     first conversion error, and it reports through *INCREMENTED whether the
     input pointer has already been moved past the location of the error.  */
  # if !defined _LIBICONV_VERSION && !defined __GLIBC__
  /* Irix iconv() inserts a NUL byte if it cannot convert.
     NetBSD iconv() inserts a question mark if it cannot convert.
     Only GNU libiconv and GNU libc are known to prefer to fail rather
     than doing a lossy conversion.
     Therefore, on other implementations, feed iconv one character at a time
     and treat a positive return value as a conversion error.  */
  static size_t
  iconv_carefully (iconv_t cd,
                   const char **inbuf, size_t *inbytesleft,
                   char **outbuf, size_t *outbytesleft,
                   bool *incremented)
  {
    const char *cursor = *inbuf;
    const char *const limit = cursor + *inbytesleft;
    char *dst = *outbuf;
    size_t dst_room = *outbytesleft;
    const char *char_start;
    size_t rc;

    for (;;)
      {
        size_t chunk = 1;

        char_start = cursor;
        rc = (size_t)(-1);

        /* Offer iconv 1, 2, 3, ... bytes until it stops answering EINVAL
           (incomplete input).  */
        while (chunk <= (size_t) (limit - char_start))
          {
            rc = iconv (cd,
                        (ICONV_CONST char **) &cursor, &chunk,
                        &dst, &dst_room);
            if (rc != (size_t)(-1) || errno != EINVAL)
              break;
            /* iconv can eat up a shift sequence but give EINVAL while
               attempting to convert the first character.  E.g. libiconv does
               this.  Count the consumed shift sequence as progress.  */
            if (cursor > char_start)
              {
                rc = 0;
                break;
              }
            chunk++;
          }

        if (rc != 0)
          break;

        /* A clean step: commit the output position to the caller.  */
        *outbuf = dst;
        *outbytesleft = dst_room;

        if (cursor >= limit)
          break;
      }

    *inbuf = cursor;
    *inbytesleft = limit - cursor;

    if (rc != (size_t)(-1) && rc > 0)
      {
        /* iconv() has already incremented the input pointer.  We cannot go
           back to a previous position, otherwise the state inside CD would
           become invalid, if FROM_CODESET is a stateful encoding.  So, tell
           the caller that *INBUF has already been incremented.  */
        *incremented = (cursor > char_start);
        errno = EILSEQ;
        return (size_t)(-1);
      }

    *incremented = false;
    return rc;
  }
  # else
  /* With GNU libiconv or GNU libc a plain iconv call is used; *INCREMENTED is
     always set to false.  */
  # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
    (*(incremented) = false, \
     iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  # endif
  186. /* iconv_carefully_1 is like iconv_carefully, except that it stops after
  187. converting one character or one shift sequence. */
  188. static size_t
  189. iconv_carefully_1 (iconv_t cd,
  190. const char **inbuf, size_t *inbytesleft,
  191. char **outbuf, size_t *outbytesleft,
  192. bool *incremented)
  193. {
  194. const char *inptr_before = *inbuf;
  195. const char *inptr = inptr_before;
  196. const char *inptr_end = inptr_before + *inbytesleft;
  197. char *outptr = *outbuf;
  198. size_t outsize = *outbytesleft;
  199. size_t res = (size_t)(-1);
  200. size_t insize;
  201. for (insize = 1; inptr_before + insize <= inptr_end; insize++)
  202. {
  203. inptr = inptr_before;
  204. res = iconv (cd,
  205. (ICONV_CONST char **) &inptr, &insize,
  206. &outptr, &outsize);
  207. if (!(res == (size_t)(-1) && errno == EINVAL))
  208. break;
  209. /* iconv can eat up a shift sequence but give EINVAL while attempting
  210. to convert the first character. E.g. libiconv does this. */
  211. if (inptr > inptr_before)
  212. {
  213. res = 0;
  214. break;
  215. }
  216. }
  217. *inbuf = inptr;
  218. *inbytesleft = inptr_end - inptr;
  219. # if !defined _LIBICONV_VERSION && !defined __GLIBC__
  220. /* Irix iconv() inserts a NUL byte if it cannot convert.
  221. NetBSD iconv() inserts a question mark if it cannot convert.
  222. Only GNU libiconv and GNU libc are known to prefer to fail rather
  223. than doing a lossy conversion. */
  224. if (res != (size_t)(-1) && res > 0)
  225. {
  226. /* iconv() has already incremented INPTR. We cannot go back to a
  227. previous INPTR, otherwise the state inside CD would become invalid,
  228. if FROM_CODESET is a stateful encoding. So, tell the caller that
  229. *INBUF has already been incremented. */
  230. *incremented = (inptr > inptr_before);
  231. errno = EILSEQ;
  232. return (size_t)(-1);
  233. }
  234. # endif
  235. if (res != (size_t)(-1))
  236. {
  237. *outbuf = outptr;
  238. *outbytesleft = outsize;
  239. }
  240. *incremented = false;
  241. return res;
  242. }
  243. /* utf8conv_carefully is like iconv, except that
  244. - it converts from UTF-8 to UTF-8,
  245. - it stops as soon as it encounters a conversion error, and it returns
  246. in *INCREMENTED a boolean telling whether it has incremented the input
  247. pointers past the error location,
  248. - if one_character_only is true, it stops after converting one
  249. character. */
  250. static size_t
  251. utf8conv_carefully (bool one_character_only,
  252. const char **inbuf, size_t *inbytesleft,
  253. char **outbuf, size_t *outbytesleft,
  254. bool *incremented)
  255. {
  256. const char *inptr = *inbuf;
  257. size_t insize = *inbytesleft;
  258. char *outptr = *outbuf;
  259. size_t outsize = *outbytesleft;
  260. size_t res;
  261. res = 0;
  262. do
  263. {
  264. ucs4_t uc;
  265. int n;
  266. int m;
  267. n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
  268. if (n < 0)
  269. {
  270. errno = (n == -2 ? EINVAL : EILSEQ);
  271. n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
  272. inptr += n;
  273. insize -= n;
  274. res = (size_t)(-1);
  275. *incremented = true;
  276. break;
  277. }
  278. if (outsize == 0)
  279. {
  280. errno = E2BIG;
  281. res = (size_t)(-1);
  282. *incremented = false;
  283. break;
  284. }
  285. m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
  286. if (m == -2)
  287. {
  288. errno = E2BIG;
  289. res = (size_t)(-1);
  290. *incremented = false;
  291. break;
  292. }
  293. inptr += n;
  294. insize -= n;
  295. if (m == -1)
  296. {
  297. errno = EILSEQ;
  298. res = (size_t)(-1);
  299. *incremented = true;
  300. break;
  301. }
  302. outptr += m;
  303. outsize -= m;
  304. }
  305. while (!one_character_only && insize > 0);
  306. *inbuf = inptr;
  307. *inbytesleft = insize;
  308. *outbuf = outptr;
  309. *outbytesleft = outsize;
  310. return res;
  311. }
  /* mem_cd_iconveh_internal converts the SRCLEN bytes at SRC and hands the
     result back through *RESULTP and *LENGTHP.

     CD is the direct FROM_CODESET -> TO_CODESET descriptor; if it is
     (iconv_t)(-1) the two-step path through UTF-8 is used from the start.
     CD1 converts FROM_CODESET to UTF-8 and CD2 converts UTF-8 to TO_CODESET.
     Where CD1 or CD2 is (iconv_t)(-1), that side is treated as being UTF-8
     already and utf8conv_carefully is used instead of iconv.

     HANDLER selects the reaction to EILSEQ:
       - iconveh_error: the function fails with errno EILSEQ;
       - otherwise input that is invalid in FROM_CODESET is replaced by '?',
         and a character for which the UTF-8 -> TO_CODESET step reports EILSEQ
         is replaced by \uXXXX or \UXXXXXXXX if HANDLER is
         iconveh_escape_sequence, by '?' if not.

     EXTRA_ALLOC bytes are kept available behind the LENGTH result bytes in
     the returned allocation (str_cd_iconveh passes 1, for the NUL byte).

     OFFSETS, if not NULL, is an array of SRCLEN entries.  All entries are
     first set to (size_t)(-1); then, each time a conversion step starts at
     an input position and the output length differs from the one last
     recorded, the entry for that input position is set to the output length.

     On entry, *RESULTP and *LENGTHP may describe a caller-owned buffer.  It
     is used as the initial work buffer only if *RESULTP != NULL and
     *LENGTHP >= tmpbufsize; see the NOTE(review) at the "done" label.

     Return value: 0 with *RESULTP and *LENGTHP set if successful, otherwise
     -1 with errno set.  On failure, memory allocated by this function is
     freed; the initial buffer is never freed or realloc'ed here.  */
  312. static int
  313. mem_cd_iconveh_internal (const char *src, size_t srclen,
  314. iconv_t cd, iconv_t cd1, iconv_t cd2,
  315. enum iconv_ilseq_handler handler,
  316. size_t extra_alloc,
  317. size_t *offsets,
  318. char **resultp, size_t *lengthp)
  319. {
  320. /* When a conversion error occurs, we cannot start using CD1 and CD2 at
  321. this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
  322. Instead, we have to start afresh from the beginning of SRC. */
  323. /* Use a temporary buffer, so that for small strings, a single malloc()
  324. call will be sufficient. */
  325. # define tmpbufsize 4096
  326. /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
  327. libiconv's UCS-4-INTERNAL encoding. */
  328. union { unsigned int align; char buf[tmpbufsize]; } tmp;
  329. # define tmpbuf tmp.buf
  330. char *initial_result;
  331. char *result;
  332. size_t allocated;
  333. size_t length;
  334. size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  /* Pick the initial work buffer: the caller's buffer if it is at least as
     large as tmpbuf, the on-stack tmpbuf otherwise.  INITIAL_RESULT stays
     fixed so that later code can tell whether RESULT is still that buffer
     (which must not be passed to realloc or free) or a malloc'ed one.  */
  335. if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
  336. {
  337. initial_result = *resultp;
  338. allocated = *lengthp;
  339. }
  340. else
  341. {
  342. initial_result = tmpbuf;
  343. allocated = sizeof (tmpbuf);
  344. }
  345. result = initial_result;
  346. /* Test whether a direct conversion is possible at all. */
  347. if (cd == (iconv_t)(-1))
  348. goto indirectly;
  349. if (offsets != NULL)
  350. {
  351. size_t i;
  352. for (i = 0; i < srclen; i++)
  353. offsets[i] = (size_t)(-1);
  354. last_length = (size_t)(-1);
  355. }
  356. length = 0;
  357. /* First, try a direct conversion, and see whether a conversion error
  358. occurs at all. */
  359. {
  360. const char *inptr = src;
  361. size_t insize = srclen;
  362. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  363. # if defined _LIBICONV_VERSION \
  364. || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  365. /* Set to the initial state. */
  366. iconv (cd, NULL, NULL, NULL, NULL);
  367. # endif
  /* Each pass converts as much as fits (or one character, when offsets are
     being recorded), then either stops, reacts to the error, or enlarges the
     result buffer and goes around again.  */
  368. while (insize > 0)
  369. {
  370. char *outptr = result + length;
  371. size_t outsize = allocated - extra_alloc - length;
  372. bool incremented;
  373. size_t res;
  374. bool grow;
  375. if (offsets != NULL)
  376. {
  377. if (length != last_length) /* ensure that offset[] be increasing */
  378. {
  379. offsets[inptr - src] = length;
  380. last_length = length;
  381. }
  382. res = iconv_carefully_1 (cd,
  383. &inptr, &insize,
  384. &outptr, &outsize,
  385. &incremented);
  386. }
  387. else
  388. /* Use iconv_carefully instead of iconv here, because:
  389. - If TO_CODESET is UTF-8, we can do the error handling in this
  390. loop, no need for a second loop,
  391. - With iconv() implementations other than GNU libiconv and GNU
  392. libc, if we use iconv() in a big swoop, checking for an E2BIG
  393. return, we lose the number of irreversible conversions. */
  394. res = iconv_carefully (cd,
  395. &inptr, &insize,
  396. &outptr, &outsize,
  397. &incremented);
  398. length = outptr - result;
  /* Enlarge ahead of need once more than half of the buffer is in use.  */
  399. grow = (length + extra_alloc > allocated / 2);
  400. if (res == (size_t)(-1))
  401. {
  402. if (errno == E2BIG)
  403. grow = true;
  404. else if (errno == EINVAL)
  405. break;
  406. else if (errno == EILSEQ && handler != iconveh_error)
  407. {
  408. if (cd2 == (iconv_t)(-1))
  409. {
  410. /* TO_CODESET is UTF-8. */
  411. /* Error handling can produce up to 1 byte of output. */
  412. if (length + 1 + extra_alloc > allocated)
  413. {
  414. char *memory;
  415. allocated = 2 * allocated;
  416. if (length + 1 + extra_alloc > allocated)
  417. abort ();
  /* The initial buffer is not realloc'ed: the first enlargement mallocs a
     fresh block and copies the bytes produced so far into it.  */
  418. if (result == initial_result)
  419. memory = (char *) malloc (allocated);
  420. else
  421. memory = (char *) realloc (result, allocated);
  422. if (memory == NULL)
  423. {
  424. if (result != initial_result)
  425. free (result);
  426. errno = ENOMEM;
  427. return -1;
  428. }
  429. if (result == initial_result)
  430. memcpy (memory, initial_result, length);
  431. result = memory;
  432. grow = false;
  433. }
  434. /* The input is invalid in FROM_CODESET. Eat up one byte
  435. and emit a question mark. */
  436. if (!incremented)
  437. {
  438. if (insize == 0)
  439. abort ();
  440. inptr++;
  441. insize--;
  442. }
  443. result[length] = '?';
  444. length++;
  445. }
  446. else
  /* TO_CODESET is not UTF-8: give up on the direct conversion and redo the
     whole input through UTF-8.  */
  447. goto indirectly;
  448. }
  449. else
  450. {
  451. if (result != initial_result)
  452. {
  453. int saved_errno = errno;
  454. free (result);
  455. errno = saved_errno;
  456. }
  457. return -1;
  458. }
  459. }
  460. if (insize == 0)
  461. break;
  462. if (grow)
  463. {
  464. char *memory;
  465. allocated = 2 * allocated;
  466. if (result == initial_result)
  467. memory = (char *) malloc (allocated);
  468. else
  469. memory = (char *) realloc (result, allocated);
  470. if (memory == NULL)
  471. {
  472. if (result != initial_result)
  473. free (result);
  474. errno = ENOMEM;
  475. return -1;
  476. }
  477. if (result == initial_result)
  478. memcpy (memory, initial_result, length);
  479. result = memory;
  480. }
  481. }
  482. }
  483. /* Now get the conversion state back to the initial state.
  484. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  485. #if defined _LIBICONV_VERSION \
  486. || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
  /* The flush may itself produce output, so it is retried with a doubled
     buffer for as long as iconv reports E2BIG.  */
  487. for (;;)
  488. {
  489. char *outptr = result + length;
  490. size_t outsize = allocated - extra_alloc - length;
  491. size_t res;
  492. res = iconv (cd, NULL, NULL, &outptr, &outsize);
  493. length = outptr - result;
  494. if (res == (size_t)(-1))
  495. {
  496. if (errno == E2BIG)
  497. {
  498. char *memory;
  499. allocated = 2 * allocated;
  500. if (result == initial_result)
  501. memory = (char *) malloc (allocated);
  502. else
  503. memory = (char *) realloc (result, allocated);
  504. if (memory == NULL)
  505. {
  506. if (result != initial_result)
  507. free (result);
  508. errno = ENOMEM;
  509. return -1;
  510. }
  511. if (result == initial_result)
  512. memcpy (memory, initial_result, length);
  513. result = memory;
  514. }
  515. else
  516. {
  517. if (result != initial_result)
  518. {
  519. int saved_errno = errno;
  520. free (result);
  521. errno = saved_errno;
  522. }
  523. return -1;
  524. }
  525. }
  526. else
  527. break;
  528. }
  529. #endif
  530. /* The direct conversion succeeded. */
  531. goto done;
  532. indirectly:
  533. /* The direct conversion failed.
  534. Use a conversion through UTF-8. */
  /* Output produced by the direct attempt is discarded: LENGTH and the
     offsets are reset, while RESULT and ALLOCATED are kept as they are.  */
  535. if (offsets != NULL)
  536. {
  537. size_t i;
  538. for (i = 0; i < srclen; i++)
  539. offsets[i] = (size_t)(-1);
  540. last_length = (size_t)(-1);
  541. }
  542. length = 0;
  543. {
  /* SLOWLY makes step 1 advance one character per call; it is set when
     offsets are recorded or when HANDLER is iconveh_error.  */
  544. const bool slowly = (offsets != NULL || handler == iconveh_error);
  545. # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
  546. char utf8buf[utf8bufsize + 1];
  547. size_t utf8len = 0;
  548. const char *in1ptr = src;
  549. size_t in1size = srclen;
  550. bool do_final_flush1 = true;
  551. bool do_final_flush2 = true;
  552. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  553. # if defined _LIBICONV_VERSION \
  554. || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  555. /* Set to the initial state. */
  556. if (cd1 != (iconv_t)(-1))
  557. iconv (cd1, NULL, NULL, NULL, NULL);
  558. if (cd2 != (iconv_t)(-1))
  559. iconv (cd2, NULL, NULL, NULL, NULL);
  560. # endif
  /* Keep going while there is input left, UTF-8 bytes pending in utf8buf, or
     a final flush of either stage still outstanding.  */
  561. while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
  562. {
  563. char *out1ptr = utf8buf + utf8len;
  564. size_t out1size = utf8bufsize - utf8len;
  565. bool incremented1;
  566. size_t res1;
  567. int errno1;
  568. /* Conversion step 1: from FROM_CODESET to UTF-8. */
  569. if (in1size > 0)
  570. {
  571. if (offsets != NULL
  572. && length != last_length) /* ensure that offset[] be increasing */
  573. {
  574. offsets[in1ptr - src] = length;
  575. last_length = length;
  576. }
  577. if (cd1 != (iconv_t)(-1))
  578. {
  579. if (slowly)
  580. res1 = iconv_carefully_1 (cd1,
  581. &in1ptr, &in1size,
  582. &out1ptr, &out1size,
  583. &incremented1);
  584. else
  585. res1 = iconv_carefully (cd1,
  586. &in1ptr, &in1size,
  587. &out1ptr, &out1size,
  588. &incremented1);
  589. }
  590. else
  591. {
  592. /* FROM_CODESET is UTF-8. */
  593. res1 = utf8conv_carefully (slowly,
  594. &in1ptr, &in1size,
  595. &out1ptr, &out1size,
  596. &incremented1);
  597. }
  598. }
  599. else if (do_final_flush1)
  600. {
  601. /* Now get the conversion state of CD1 back to the initial state.
  602. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  603. # if defined _LIBICONV_VERSION \
  604. || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
  605. if (cd1 != (iconv_t)(-1))
  606. res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
  607. else
  608. # endif
  609. res1 = 0;
  610. do_final_flush1 = false;
  611. incremented1 = true;
  612. }
  613. else
  614. {
  615. res1 = 0;
  616. incremented1 = true;
  617. }
  618. if (res1 == (size_t)(-1)
  619. && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
  620. {
  621. if (result != initial_result)
  622. {
  623. int saved_errno = errno;
  624. free (result);
  625. errno = saved_errno;
  626. }
  627. return -1;
  628. }
  629. if (res1 == (size_t)(-1)
  630. && errno == EILSEQ && handler != iconveh_error)
  631. {
  632. /* The input is invalid in FROM_CODESET. Eat up one byte and
  633. emit a question mark. Room for the question mark was allocated
  634. at the end of utf8buf. */
  635. if (!incremented1)
  636. {
  637. if (in1size == 0)
  638. abort ();
  639. in1ptr++;
  640. in1size--;
  641. }
  642. *out1ptr++ = '?';
  643. res1 = 0;
  644. }
  /* Save the errno of step 1 now; step 2 below may overwrite errno before
     the step 1 outcome is finally acted upon.  */
  645. errno1 = errno;
  646. utf8len = out1ptr - utf8buf;
  /* Run step 2 when offsets are recorded, when the input is exhausted, when
     utf8buf is more than half full, or when step 1 ran out of room.  */
  647. if (offsets != NULL
  648. || in1size == 0
  649. || utf8len > utf8bufsize / 2
  650. || (res1 == (size_t)(-1) && errno1 == E2BIG))
  651. {
  652. /* Conversion step 2: from UTF-8 to TO_CODESET. */
  653. const char *in2ptr = utf8buf;
  654. size_t in2size = utf8len;
  655. while (in2size > 0
  656. || (in1size == 0 && !do_final_flush1 && do_final_flush2))
  657. {
  658. char *out2ptr = result + length;
  659. size_t out2size = allocated - extra_alloc - length;
  660. bool incremented2;
  661. size_t res2;
  662. bool grow;
  663. if (in2size > 0)
  664. {
  665. if (cd2 != (iconv_t)(-1))
  666. res2 = iconv_carefully (cd2,
  667. &in2ptr, &in2size,
  668. &out2ptr, &out2size,
  669. &incremented2);
  670. else
  671. /* TO_CODESET is UTF-8. */
  672. res2 = utf8conv_carefully (false,
  673. &in2ptr, &in2size,
  674. &out2ptr, &out2size,
  675. &incremented2);
  676. }
  677. else /* in1size == 0 && !do_final_flush1
  678. && in2size == 0 && do_final_flush2 */
  679. {
  680. /* Now get the conversion state of CD2 back to the initial
  681. state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  682. # if defined _LIBICONV_VERSION \
  683. || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
  684. if (cd2 != (iconv_t)(-1))
  685. res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
  686. else
  687. # endif
  688. res2 = 0;
  689. do_final_flush2 = false;
  690. incremented2 = true;
  691. }
  692. length = out2ptr - result;
  693. grow = (length + extra_alloc > allocated / 2);
  694. if (res2 == (size_t)(-1))
  695. {
  696. if (errno == E2BIG)
  697. grow = true;
  698. else if (errno == EINVAL)
  699. break;
  700. else if (errno == EILSEQ && handler != iconveh_error)
  701. {
  702. /* Error handling can produce up to 10 bytes of ASCII
  703. output. But TO_CODESET may be UCS-2, UTF-16 or
  704. UCS-4, so use CD2 here as well. */
  705. char scratchbuf[10];
  706. size_t scratchlen;
  707. ucs4_t uc;
  708. const char *inptr;
  709. size_t insize;
  710. size_t res;
  /* Fetch the offending character into UC: the one just before in2ptr if
     the converter already stepped over it, else the one at in2ptr, which is
     then skipped here.  */
  711. if (incremented2)
  712. {
  713. if (u8_prev (&uc, (const uint8_t *) in2ptr,
  714. (const uint8_t *) utf8buf)
  715. == NULL)
  716. abort ();
  717. }
  718. else
  719. {
  720. int n;
  721. if (in2size == 0)
  722. abort ();
  723. n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
  724. in2size);
  725. in2ptr += n;
  726. in2size -= n;
  727. }
  /* Build the ASCII replacement: a backslash, then 'u' and 4 hex digits for
     UC below 0x10000 or 'U' and 8 hex digits otherwise; a lone '?' when no
     escape sequence is requested.  */
  728. if (handler == iconveh_escape_sequence)
  729. {
  730. static char hex[16] = "0123456789ABCDEF";
  731. scratchlen = 0;
  732. scratchbuf[scratchlen++] = '\\';
  733. if (uc < 0x10000)
  734. scratchbuf[scratchlen++] = 'u';
  735. else
  736. {
  737. scratchbuf[scratchlen++] = 'U';
  738. scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
  739. scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
  740. scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
  741. scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
  742. }
  743. scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
  744. scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
  745. scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
  746. scratchbuf[scratchlen++] = hex[uc & 15];
  747. }
  748. else
  749. {
  750. scratchbuf[0] = '?';
  751. scratchlen = 1;
  752. }
  753. inptr = scratchbuf;
  754. insize = scratchlen;
  755. if (cd2 != (iconv_t)(-1))
  756. res = iconv (cd2,
  757. (ICONV_CONST char **) &inptr, &insize,
  758. &out2ptr, &out2size);
  759. else
  760. {
  761. /* TO_CODESET is UTF-8. */
  762. if (out2size >= insize)
  763. {
  764. memcpy (out2ptr, inptr, insize);
  765. out2ptr += insize;
  766. out2size -= insize;
  767. inptr += insize;
  768. insize = 0;
  769. res = 0;
  770. }
  771. else
  772. {
  773. errno = E2BIG;
  774. res = (size_t)(-1);
  775. }
  776. }
  777. length = out2ptr - result;
  /* The replacement did not fit: double the buffer once and retry.  */
  778. if (res == (size_t)(-1) && errno == E2BIG)
  779. {
  780. char *memory;
  781. allocated = 2 * allocated;
  782. if (length + 1 + extra_alloc > allocated)
  783. abort ();
  784. if (result == initial_result)
  785. memory = (char *) malloc (allocated);
  786. else
  787. memory = (char *) realloc (result, allocated);
  788. if (memory == NULL)
  789. {
  790. if (result != initial_result)
  791. free (result);
  792. errno = ENOMEM;
  793. return -1;
  794. }
  795. if (result == initial_result)
  796. memcpy (memory, initial_result, length);
  797. result = memory;
  798. grow = false;
  799. out2ptr = result + length;
  800. out2size = allocated - extra_alloc - length;
  801. if (cd2 != (iconv_t)(-1))
  802. res = iconv (cd2,
  803. (ICONV_CONST char **) &inptr,
  804. &insize,
  805. &out2ptr, &out2size);
  806. else
  807. {
  808. /* TO_CODESET is UTF-8. */
  809. if (!(out2size >= insize))
  810. abort ();
  811. memcpy (out2ptr, inptr, insize);
  812. out2ptr += insize;
  813. out2size -= insize;
  814. inptr += insize;
  815. insize = 0;
  816. res = 0;
  817. }
  818. length = out2ptr - result;
  819. }
  820. # if !defined _LIBICONV_VERSION && !defined __GLIBC__
  821. /* Irix iconv() inserts a NUL byte if it cannot convert.
  822. NetBSD iconv() inserts a question mark if it cannot
  823. convert.
  824. Only GNU libiconv and GNU libc are known to prefer
  825. to fail rather than doing a lossy conversion. */
  826. if (res != (size_t)(-1) && res > 0)
  827. {
  828. errno = EILSEQ;
  829. res = (size_t)(-1);
  830. }
  831. # endif
  832. if (res == (size_t)(-1))
  833. {
  834. /* Failure converting the ASCII replacement. */
  835. if (result != initial_result)
  836. {
  837. int saved_errno = errno;
  838. free (result);
  839. errno = saved_errno;
  840. }
  841. return -1;
  842. }
  843. }
  844. else
  845. {
  846. if (result != initial_result)
  847. {
  848. int saved_errno = errno;
  849. free (result);
  850. errno = saved_errno;
  851. }
  852. return -1;
  853. }
  854. }
  855. if (!(in2size > 0
  856. || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
  857. break;
  858. if (grow)
  859. {
  860. char *memory;
  861. allocated = 2 * allocated;
  862. if (result == initial_result)
  863. memory = (char *) malloc (allocated);
  864. else
  865. memory = (char *) realloc (result, allocated);
  866. if (memory == NULL)
  867. {
  868. if (result != initial_result)
  869. free (result);
  870. errno = ENOMEM;
  871. return -1;
  872. }
  873. if (result == initial_result)
  874. memcpy (memory, initial_result, length);
  875. result = memory;
  876. }
  877. }
  878. /* Move the remaining bytes to the beginning of utf8buf. */
  879. if (in2size > 0)
  880. memmove (utf8buf, in2ptr, in2size);
  881. utf8len = in2size;
  882. }
  /* Act on the step 1 outcome saved in ERRNO1: EINVAL (incomplete input)
     drops the rest of the input, EILSEQ (only still pending here when
     HANDLER is iconveh_error) fails the conversion.  */
  883. if (res1 == (size_t)(-1))
  884. {
  885. if (errno1 == EINVAL)
  886. in1size = 0;
  887. else if (errno1 == EILSEQ)
  888. {
  889. if (result != initial_result)
  890. free (result);
  891. errno = errno1;
  892. return -1;
  893. }
  894. }
  895. }
  896. # undef utf8bufsize
  897. }
  898. done:
  899. /* Now the final memory allocation. */
  /* NOTE(review): when the work was done in tmpbuf, the result is always
     copied into fresh malloc'ed memory.  A caller buffer smaller than
     tmpbufsize is therefore never used, even if the result would fit into
     it - confirm that this matches the contract documented in the header.  */
  900. if (result == tmpbuf)
  901. {
  902. size_t memsize = length + extra_alloc;
  903. char *memory;
  904. memory = (char *) malloc (memsize > 0 ? memsize : 1);
  905. if (memory != NULL)
  906. {
  907. memcpy (memory, tmpbuf, length);
  908. result = memory;
  909. }
  910. else
  911. {
  912. errno = ENOMEM;
  913. return -1;
  914. }
  915. }
  916. else if (result != *resultp && length + extra_alloc < allocated)
  917. {
  918. /* Shrink the allocated memory if possible. */
  /* A failing realloc is tolerated: the larger block is returned instead.  */
  919. size_t memsize = length + extra_alloc;
  920. char *memory;
  921. memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
  922. if (memory != NULL)
  923. result = memory;
  924. }
  925. *resultp = result;
  926. *lengthp = length;
  927. return 0;
  928. # undef tmpbuf
  929. # undef tmpbufsize
  930. }
  931. int
  932. mem_cd_iconveh (const char *src, size_t srclen,
  933. const iconveh_t *cd,
  934. enum iconv_ilseq_handler handler,
  935. size_t *offsets,
  936. char **resultp, size_t *lengthp)
  937. {
  938. return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
  939. handler, 0, offsets, resultp, lengthp);
  940. }
  941. char *
  942. str_cd_iconveh (const char *src,
  943. const iconveh_t *cd,
  944. enum iconv_ilseq_handler handler)
  945. {
  946. /* For most encodings, a trailing NUL byte in the input will be converted
  947. to a trailing NUL byte in the output. But not for UTF-7. So that this
  948. function is usable for UTF-7, we have to exclude the NUL byte from the
  949. conversion and add it by hand afterwards. */
  950. char *result = NULL;
  951. size_t length = 0;
  952. int retval = mem_cd_iconveh_internal (src, strlen (src),
  953. cd->cd, cd->cd1, cd->cd2, handler, 1,
  954. NULL, &result, &length);
  955. if (retval < 0)
  956. {
  957. if (result != NULL)
  958. {
  959. int saved_errno = errno;
  960. free (result);
  961. errno = saved_errno;
  962. }
  963. return NULL;
  964. }
  965. /* Add the terminating NUL byte. */
  966. result[length] = '\0';
  967. return result;
  968. }
  969. #endif
  970. int
  971. mem_iconveh (const char *src, size_t srclen,
  972. const char *from_codeset, const char *to_codeset,
  973. enum iconv_ilseq_handler handler,
  974. size_t *offsets,
  975. char **resultp, size_t *lengthp)
  976. {
  977. if (srclen == 0)
  978. {
  979. /* Nothing to convert. */
  980. *lengthp = 0;
  981. return 0;
  982. }
  983. else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
  984. {
  985. char *result;
  986. if (*resultp != NULL && *lengthp >= srclen)
  987. result = *resultp;
  988. else
  989. {
  990. result = (char *) malloc (srclen);
  991. if (result == NULL)
  992. {
  993. errno = ENOMEM;
  994. return -1;
  995. }
  996. }
  997. memcpy (result, src, srclen);
  998. *resultp = result;
  999. *lengthp = srclen;
  1000. return 0;
  1001. }
  1002. else
  1003. {
  1004. #if HAVE_ICONV
  1005. iconveh_t cd;
  1006. char *result;
  1007. size_t length;
  1008. int retval;
  1009. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1010. return -1;
  1011. result = *resultp;
  1012. length = *lengthp;
  1013. retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
  1014. &result, &length);
  1015. if (retval < 0)
  1016. {
  1017. /* Close cd, but preserve the errno from str_cd_iconv. */
  1018. int saved_errno = errno;
  1019. iconveh_close (&cd);
  1020. errno = saved_errno;
  1021. }
  1022. else
  1023. {
  1024. if (iconveh_close (&cd) < 0)
  1025. {
  1026. /* Return -1, but free the allocated memory, and while doing
  1027. that, preserve the errno from iconveh_close. */
  1028. int saved_errno = errno;
  1029. if (result != *resultp && result != NULL)
  1030. free (result);
  1031. errno = saved_errno;
  1032. return -1;
  1033. }
  1034. *resultp = result;
  1035. *lengthp = length;
  1036. }
  1037. return retval;
  1038. #else
  1039. /* This is a different error code than if iconv_open existed but didn't
  1040. support from_codeset and to_codeset, so that the caller can emit
  1041. an error message such as
  1042. "iconv() is not supported. Installing GNU libiconv and
  1043. then reinstalling this package would fix this." */
  1044. errno = ENOSYS;
  1045. return -1;
  1046. #endif
  1047. }
  1048. }
  1049. char *
  1050. str_iconveh (const char *src,
  1051. const char *from_codeset, const char *to_codeset,
  1052. enum iconv_ilseq_handler handler)
  1053. {
  1054. if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
  1055. {
  1056. char *result = strdup (src);
  1057. if (result == NULL)
  1058. errno = ENOMEM;
  1059. return result;
  1060. }
  1061. else
  1062. {
  1063. #if HAVE_ICONV
  1064. iconveh_t cd;
  1065. char *result;
  1066. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1067. return NULL;
  1068. result = str_cd_iconveh (src, &cd, handler);
  1069. if (result == NULL)
  1070. {
  1071. /* Close cd, but preserve the errno from str_cd_iconv. */
  1072. int saved_errno = errno;
  1073. iconveh_close (&cd);
  1074. errno = saved_errno;
  1075. }
  1076. else
  1077. {
  1078. if (iconveh_close (&cd) < 0)
  1079. {
  1080. /* Return NULL, but free the allocated memory, and while doing
  1081. that, preserve the errno from iconveh_close. */
  1082. int saved_errno = errno;
  1083. free (result);
  1084. errno = saved_errno;
  1085. return NULL;
  1086. }
  1087. }
  1088. return result;
  1089. #else
  1090. /* This is a different error code than if iconv_open existed but didn't
  1091. support from_codeset and to_codeset, so that the caller can emit
  1092. an error message such as
  1093. "iconv() is not supported. Installing GNU libiconv and
  1094. then reinstalling this package would fix this." */
  1095. errno = ENOSYS;
  1096. return NULL;
  1097. #endif
  1098. }
  1099. }