striconveh.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236
  1. /* Character set conversion with error handling.
  2. Copyright (C) 2001-2023 Free Software Foundation, Inc.
  3. Written by Bruno Haible and Simon Josefsson.
  4. This file is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU Lesser General Public License as
  6. published by the Free Software Foundation; either version 2.1 of the
  7. License, or (at your option) any later version.
  8. This file is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <https://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "striconveh.h"
  17. #include <errno.h>
  18. #include <stdlib.h>
  19. #include <string.h>
  20. #if HAVE_ICONV
  21. # include <iconv.h>
  22. # include "unistr.h"
  23. #endif
  24. #include "c-strcase.h"
  25. #include "c-strcaseeq.h"
  26. #ifndef SIZE_MAX
  27. # define SIZE_MAX ((size_t) -1)
  28. #endif
  29. #if HAVE_ICONV
  30. /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  31. conversion error occurs, we may have to determine the Unicode representation
  32. of the inconvertible character. */
  33. int
  34. iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  35. {
  36. iconv_t cd;
  37. iconv_t cd1;
  38. iconv_t cd2;
  39. /* Avoid glibc-2.1 bug with EUC-KR. */
  40. # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  41. && !defined _LIBICONV_VERSION
  42. if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  43. || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  44. {
  45. errno = EINVAL;
  46. return -1;
  47. }
  48. # endif
  49. cd = iconv_open (to_codeset, from_codeset);
  50. if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  51. cd1 = (iconv_t)(-1);
  52. else
  53. {
  54. cd1 = iconv_open ("UTF-8", from_codeset);
  55. if (cd1 == (iconv_t)(-1))
  56. {
  57. int saved_errno = errno;
  58. if (cd != (iconv_t)(-1))
  59. iconv_close (cd);
  60. errno = saved_errno;
  61. return -1;
  62. }
  63. }
  64. if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  65. # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  66. && !defined __UCLIBC__) \
  67. || _LIBICONV_VERSION >= 0x0105
  68. || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  69. # endif
  70. )
  71. cd2 = (iconv_t)(-1);
  72. else
  73. {
  74. cd2 = iconv_open (to_codeset, "UTF-8");
  75. if (cd2 == (iconv_t)(-1))
  76. {
  77. int saved_errno = errno;
  78. if (cd1 != (iconv_t)(-1))
  79. iconv_close (cd1);
  80. if (cd != (iconv_t)(-1))
  81. iconv_close (cd);
  82. errno = saved_errno;
  83. return -1;
  84. }
  85. }
  86. cdp->cd = cd;
  87. cdp->cd1 = cd1;
  88. cdp->cd2 = cd2;
  89. return 0;
  90. }
  91. int
  92. iconveh_close (const iconveh_t *cd)
  93. {
  94. if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
  95. {
  96. /* Return -1, but preserve the errno from iconv_close. */
  97. int saved_errno = errno;
  98. if (cd->cd1 != (iconv_t)(-1))
  99. iconv_close (cd->cd1);
  100. if (cd->cd != (iconv_t)(-1))
  101. iconv_close (cd->cd);
  102. errno = saved_errno;
  103. return -1;
  104. }
  105. if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
  106. {
  107. /* Return -1, but preserve the errno from iconv_close. */
  108. int saved_errno = errno;
  109. if (cd->cd != (iconv_t)(-1))
  110. iconv_close (cd->cd);
  111. errno = saved_errno;
  112. return -1;
  113. }
  114. if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
  115. return -1;
  116. return 0;
  117. }
  /* iconv_carefully behaves like iconv, with two differences: it stops at the
     first conversion error, and it stores in *INCREMENTED whether the input
     pointer has already been moved past the location of that error.  */
  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  /* Only GNU libiconv and GNU libc are known to fail rather than to convert
     lossily: Irix iconv() inserts a NUL byte, and NetBSD iconv() inserts a
     question mark, when a character cannot be converted.  Therefore feed the
     input to iconv() in the smallest pieces it accepts, and treat a positive
     return value (a count of irreversible conversions) as an EILSEQ error.  */
  static size_t
  iconv_carefully (iconv_t cd,
                   const char **inbuf, size_t *inbytesleft,
                   char **outbuf, size_t *outbytesleft,
                   bool *incremented)
  {
    const char *src = *inbuf;
    const char *const src_end = src + *inbytesleft;
    char *dst = *outbuf;
    size_t dst_left = *outbytesleft;
    const char *piece_start;
    size_t rc;

    for (;;)
      {
        size_t piece_len = 1;

        piece_start = src;
        rc = (size_t)(-1);

        /* Offer iconv() a growing prefix of the remaining input until it no
           longer reports it as incomplete (EINVAL).  */
        while (src + piece_len <= src_end)
          {
            rc = iconv (cd,
                        (ICONV_CONST char **) &src, &piece_len,
                        &dst, &dst_left);
            if (rc != (size_t)(-1) || errno != EINVAL)
              break;
            /* iconv can eat up a shift sequence but give EINVAL while
               attempting to convert the first character.  E.g. libiconv
               does this.  */
            if (src > piece_start)
              {
                rc = 0;
                break;
              }
            piece_len++;
          }

        if (rc != 0)
          break;

        /* The output is committed only after a clean conversion step.  */
        *outbuf = dst;
        *outbytesleft = dst_left;

        if (src >= src_end)
          break;
      }

    *inbuf = src;
    *inbytesleft = src_end - src;

    if (rc == (size_t)(-1) || rc == 0)
      {
        *incremented = false;
        return rc;
      }

    /* iconv() converted lossily and has already advanced the input pointer.
       Going back to an earlier position is not possible: if FROM_CODESET is a
       stateful encoding, the state inside CD would become invalid.  So tell
       the caller that *INBUF has already been incremented.  */
    *incremented = (src > piece_start);
    errno = EILSEQ;
    return (size_t)(-1);
  }
  # else
  # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
      (*(incremented) = false, \
       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  # endif
  188. /* iconv_carefully_1 is like iconv_carefully, except that it stops after
  189. converting one character or one shift sequence. */
  190. static size_t
  191. iconv_carefully_1 (iconv_t cd,
  192. const char **inbuf, size_t *inbytesleft,
  193. char **outbuf, size_t *outbytesleft,
  194. bool *incremented)
  195. {
  196. const char *inptr_before = *inbuf;
  197. const char *inptr = inptr_before;
  198. const char *inptr_end = inptr_before + *inbytesleft;
  199. char *outptr = *outbuf;
  200. size_t outsize = *outbytesleft;
  201. size_t res = (size_t)(-1);
  202. size_t insize;
  203. for (insize = 1; inptr_before + insize <= inptr_end; insize++)
  204. {
  205. inptr = inptr_before;
  206. res = iconv (cd,
  207. (ICONV_CONST char **) &inptr, &insize,
  208. &outptr, &outsize);
  209. if (!(res == (size_t)(-1) && errno == EINVAL))
  210. break;
  211. /* iconv can eat up a shift sequence but give EINVAL while attempting
  212. to convert the first character. E.g. libiconv does this. */
  213. if (inptr > inptr_before)
  214. {
  215. res = 0;
  216. break;
  217. }
  218. }
  219. *inbuf = inptr;
  220. *inbytesleft = inptr_end - inptr;
  221. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  222. /* Irix iconv() inserts a NUL byte if it cannot convert.
  223. NetBSD iconv() inserts a question mark if it cannot convert.
  224. Only GNU libiconv and GNU libc are known to prefer to fail rather
  225. than doing a lossy conversion. */
  226. if (res != (size_t)(-1) && res > 0)
  227. {
  228. /* iconv() has already incremented INPTR. We cannot go back to a
  229. previous INPTR, otherwise the state inside CD would become invalid,
  230. if FROM_CODESET is a stateful encoding. So, tell the caller that
  231. *INBUF has already been incremented. */
  232. *incremented = (inptr > inptr_before);
  233. errno = EILSEQ;
  234. return (size_t)(-1);
  235. }
  236. # endif
  237. if (res != (size_t)(-1))
  238. {
  239. *outbuf = outptr;
  240. *outbytesleft = outsize;
  241. }
  242. *incremented = false;
  243. return res;
  244. }
  245. /* utf8conv_carefully is like iconv, except that
  246. - it converts from UTF-8 to UTF-8,
  247. - it stops as soon as it encounters a conversion error, and it returns
  248. in *INCREMENTED a boolean telling whether it has incremented the input
  249. pointers past the error location,
  250. - if one_character_only is true, it stops after converting one
  251. character. */
  252. static size_t
  253. utf8conv_carefully (bool one_character_only,
  254. const char **inbuf, size_t *inbytesleft,
  255. char **outbuf, size_t *outbytesleft,
  256. bool *incremented)
  257. {
  258. const char *inptr = *inbuf;
  259. size_t insize = *inbytesleft;
  260. char *outptr = *outbuf;
  261. size_t outsize = *outbytesleft;
  262. size_t res;
  263. res = 0;
  264. do
  265. {
  266. ucs4_t uc;
  267. int n;
  268. int m;
  269. n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
  270. if (n < 0)
  271. {
  272. errno = (n == -2 ? EINVAL : EILSEQ);
  273. n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
  274. inptr += n;
  275. insize -= n;
  276. res = (size_t)(-1);
  277. *incremented = true;
  278. break;
  279. }
  280. if (outsize == 0)
  281. {
  282. errno = E2BIG;
  283. res = (size_t)(-1);
  284. *incremented = false;
  285. break;
  286. }
  287. m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
  288. if (m == -2)
  289. {
  290. errno = E2BIG;
  291. res = (size_t)(-1);
  292. *incremented = false;
  293. break;
  294. }
  295. inptr += n;
  296. insize -= n;
  297. if (m == -1)
  298. {
  299. errno = EILSEQ;
  300. res = (size_t)(-1);
  301. *incremented = true;
  302. break;
  303. }
  304. outptr += m;
  305. outsize -= m;
  306. }
  307. while (!one_character_only && insize > 0);
  308. *inbuf = inptr;
  309. *inbytesleft = insize;
  310. *outbuf = outptr;
  311. *outbytesleft = outsize;
  312. return res;
  313. }
  314. static int
  315. mem_cd_iconveh_internal (const char *src, size_t srclen,
  316. iconv_t cd, iconv_t cd1, iconv_t cd2,
  317. enum iconv_ilseq_handler handler,
  318. size_t extra_alloc,
  319. size_t *offsets,
  320. char **resultp, size_t *lengthp)
  321. {
  322. /* When a conversion error occurs, we cannot start using CD1 and CD2 at
  323. this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
  324. Instead, we have to start afresh from the beginning of SRC. */
  325. /* Use a temporary buffer, so that for small strings, a single malloc()
  326. call will be sufficient. */
  327. # define tmpbufsize 4096
  328. /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
  329. libiconv's UCS-4-INTERNAL encoding. */
  330. union { unsigned int align; char buf[tmpbufsize]; } tmp;
  331. # define tmpbuf tmp.buf
  332. char *initial_result;
  333. char *result;
  334. size_t allocated;
  335. size_t length;
  336. size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  337. if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
  338. {
  339. initial_result = *resultp;
  340. allocated = *lengthp;
  341. }
  342. else
  343. {
  344. initial_result = tmpbuf;
  345. allocated = sizeof (tmpbuf);
  346. }
  347. result = initial_result;
  348. /* Test whether a direct conversion is possible at all. */
  349. if (cd == (iconv_t)(-1))
  350. goto indirectly;
  351. if (offsets != NULL)
  352. {
  353. size_t i;
  354. for (i = 0; i < srclen; i++)
  355. offsets[i] = (size_t)(-1);
  356. last_length = (size_t)(-1);
  357. }
  358. length = 0;
  359. /* First, try a direct conversion, and see whether a conversion error
  360. occurs at all. */
  361. {
  362. const char *inptr = src;
  363. size_t insize = srclen;
  364. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  365. # if defined _LIBICONV_VERSION \
  366. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  367. || defined __sun)
  368. /* Set to the initial state. */
  369. iconv (cd, NULL, NULL, NULL, NULL);
  370. # endif
  371. while (insize > 0)
  372. {
  373. char *outptr = result + length;
  374. size_t outsize = allocated - extra_alloc - length;
  375. bool incremented;
  376. size_t res;
  377. bool grow;
  378. if (offsets != NULL)
  379. {
  380. if (length != last_length) /* ensure that offset[] be increasing */
  381. {
  382. offsets[inptr - src] = length;
  383. last_length = length;
  384. }
  385. res = iconv_carefully_1 (cd,
  386. &inptr, &insize,
  387. &outptr, &outsize,
  388. &incremented);
  389. }
  390. else
  391. /* Use iconv_carefully instead of iconv here, because:
  392. - If TO_CODESET is UTF-8, we can do the error handling in this
  393. loop, no need for a second loop,
  394. - With iconv() implementations other than GNU libiconv and GNU
  395. libc, if we use iconv() in a big swoop, checking for an E2BIG
  396. return, we lose the number of irreversible conversions. */
  397. res = iconv_carefully (cd,
  398. &inptr, &insize,
  399. &outptr, &outsize,
  400. &incremented);
  401. length = outptr - result;
  402. grow = (length + extra_alloc > allocated / 2);
  403. if (res == (size_t)(-1))
  404. {
  405. if (errno == E2BIG)
  406. grow = true;
  407. else if (errno == EINVAL)
  408. break;
  409. else if (errno == EILSEQ && handler != iconveh_error)
  410. {
  411. if (cd2 == (iconv_t)(-1))
  412. {
  413. /* TO_CODESET is UTF-8. */
  414. /* Error handling can produce up to 1 or 3 bytes of
  415. output. */
  416. size_t extra_need =
  417. (handler == iconveh_replacement_character ? 3 : 1);
  418. if (length + extra_need + extra_alloc > allocated)
  419. {
  420. char *memory;
  421. allocated = 2 * allocated;
  422. if (length + extra_need + extra_alloc > allocated)
  423. allocated = 2 * allocated;
  424. if (length + extra_need + extra_alloc > allocated)
  425. abort ();
  426. if (result == initial_result)
  427. memory = (char *) malloc (allocated);
  428. else
  429. memory = (char *) realloc (result, allocated);
  430. if (memory == NULL)
  431. {
  432. if (result != initial_result)
  433. free (result);
  434. errno = ENOMEM;
  435. return -1;
  436. }
  437. if (result == initial_result)
  438. memcpy (memory, initial_result, length);
  439. result = memory;
  440. grow = false;
  441. }
  442. /* The input is invalid in FROM_CODESET. Eat up one byte
  443. and emit a replacement character or a question mark. */
  444. if (!incremented)
  445. {
  446. if (insize == 0)
  447. abort ();
  448. inptr++;
  449. insize--;
  450. }
  451. if (handler == iconveh_replacement_character)
  452. {
  453. /* U+FFFD in UTF-8 encoding. */
  454. result[length+0] = '\357';
  455. result[length+1] = '\277';
  456. result[length+2] = '\275';
  457. length += 3;
  458. }
  459. else
  460. {
  461. result[length] = '?';
  462. length++;
  463. }
  464. }
  465. else
  466. goto indirectly;
  467. }
  468. else
  469. {
  470. if (result != initial_result)
  471. free (result);
  472. return -1;
  473. }
  474. }
  475. if (insize == 0)
  476. break;
  477. if (grow)
  478. {
  479. char *memory;
  480. allocated = 2 * allocated;
  481. if (result == initial_result)
  482. memory = (char *) malloc (allocated);
  483. else
  484. memory = (char *) realloc (result, allocated);
  485. if (memory == NULL)
  486. {
  487. if (result != initial_result)
  488. free (result);
  489. errno = ENOMEM;
  490. return -1;
  491. }
  492. if (result == initial_result)
  493. memcpy (memory, initial_result, length);
  494. result = memory;
  495. }
  496. }
  497. }
  498. /* Now get the conversion state back to the initial state.
  499. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  500. #if defined _LIBICONV_VERSION \
  501. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  502. || defined __sun)
  503. for (;;)
  504. {
  505. char *outptr = result + length;
  506. size_t outsize = allocated - extra_alloc - length;
  507. size_t res;
  508. res = iconv (cd, NULL, NULL, &outptr, &outsize);
  509. length = outptr - result;
  510. if (res == (size_t)(-1))
  511. {
  512. if (errno == E2BIG)
  513. {
  514. char *memory;
  515. allocated = 2 * allocated;
  516. if (result == initial_result)
  517. memory = (char *) malloc (allocated);
  518. else
  519. memory = (char *) realloc (result, allocated);
  520. if (memory == NULL)
  521. {
  522. if (result != initial_result)
  523. free (result);
  524. errno = ENOMEM;
  525. return -1;
  526. }
  527. if (result == initial_result)
  528. memcpy (memory, initial_result, length);
  529. result = memory;
  530. }
  531. else
  532. {
  533. if (result != initial_result)
  534. free (result);
  535. return -1;
  536. }
  537. }
  538. else
  539. break;
  540. }
  541. #endif
  542. /* The direct conversion succeeded. */
  543. goto done;
  544. indirectly:
  545. /* The direct conversion failed.
  546. Use a conversion through UTF-8. */
  547. if (offsets != NULL)
  548. {
  549. size_t i;
  550. for (i = 0; i < srclen; i++)
  551. offsets[i] = (size_t)(-1);
  552. last_length = (size_t)(-1);
  553. }
  554. length = 0;
  555. {
  556. const bool slowly = (offsets != NULL || handler == iconveh_error);
  557. # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
  558. char utf8buf[utf8bufsize + 3];
  559. size_t utf8len = 0;
  560. const char *in1ptr = src;
  561. size_t in1size = srclen;
  562. bool do_final_flush1 = true;
  563. bool do_final_flush2 = true;
  564. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  565. # if defined _LIBICONV_VERSION \
  566. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  567. || defined __sun)
  568. /* Set to the initial state. */
  569. if (cd1 != (iconv_t)(-1))
  570. iconv (cd1, NULL, NULL, NULL, NULL);
  571. if (cd2 != (iconv_t)(-1))
  572. iconv (cd2, NULL, NULL, NULL, NULL);
  573. # endif
  574. while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
  575. {
  576. char *out1ptr = utf8buf + utf8len;
  577. size_t out1size = utf8bufsize - utf8len;
  578. bool incremented1;
  579. size_t res1;
  580. int errno1;
  581. /* Conversion step 1: from FROM_CODESET to UTF-8. */
  582. if (in1size > 0)
  583. {
  584. if (offsets != NULL
  585. && length != last_length) /* ensure that offset[] be increasing */
  586. {
  587. offsets[in1ptr - src] = length;
  588. last_length = length;
  589. }
  590. if (cd1 != (iconv_t)(-1))
  591. {
  592. if (slowly)
  593. res1 = iconv_carefully_1 (cd1,
  594. &in1ptr, &in1size,
  595. &out1ptr, &out1size,
  596. &incremented1);
  597. else
  598. res1 = iconv_carefully (cd1,
  599. &in1ptr, &in1size,
  600. &out1ptr, &out1size,
  601. &incremented1);
  602. }
  603. else
  604. {
  605. /* FROM_CODESET is UTF-8. */
  606. res1 = utf8conv_carefully (slowly,
  607. &in1ptr, &in1size,
  608. &out1ptr, &out1size,
  609. &incremented1);
  610. }
  611. }
  612. else if (do_final_flush1)
  613. {
  614. /* Now get the conversion state of CD1 back to the initial state.
  615. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  616. # if defined _LIBICONV_VERSION \
  617. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  618. || defined __sun)
  619. if (cd1 != (iconv_t)(-1))
  620. res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
  621. else
  622. # endif
  623. res1 = 0;
  624. do_final_flush1 = false;
  625. incremented1 = true;
  626. }
  627. else
  628. {
  629. res1 = 0;
  630. incremented1 = true;
  631. }
  632. if (res1 == (size_t)(-1)
  633. && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
  634. {
  635. if (result != initial_result)
  636. free (result);
  637. return -1;
  638. }
  639. if (res1 == (size_t)(-1)
  640. && errno == EILSEQ && handler != iconveh_error)
  641. {
  642. /* The input is invalid in FROM_CODESET. Eat up one byte and
  643. emit a U+FFFD character or a question mark. Room for this
  644. character was allocated at the end of utf8buf. */
  645. if (!incremented1)
  646. {
  647. if (in1size == 0)
  648. abort ();
  649. in1ptr++;
  650. in1size--;
  651. }
  652. if (handler == iconveh_replacement_character)
  653. {
  654. /* U+FFFD in UTF-8 encoding. */
  655. out1ptr[0] = '\357';
  656. out1ptr[1] = '\277';
  657. out1ptr[2] = '\275';
  658. out1ptr += 3;
  659. }
  660. else
  661. *out1ptr++ = '?';
  662. res1 = 0;
  663. }
  664. errno1 = errno;
  665. utf8len = out1ptr - utf8buf;
  666. if (offsets != NULL
  667. || in1size == 0
  668. || utf8len > utf8bufsize / 2
  669. || (res1 == (size_t)(-1) && errno1 == E2BIG))
  670. {
  671. /* Conversion step 2: from UTF-8 to TO_CODESET. */
  672. const char *in2ptr = utf8buf;
  673. size_t in2size = utf8len;
  674. while (in2size > 0
  675. || (in1size == 0 && !do_final_flush1 && do_final_flush2))
  676. {
  677. char *out2ptr = result + length;
  678. size_t out2size = allocated - extra_alloc - length;
  679. bool incremented2;
  680. size_t res2;
  681. bool grow;
  682. if (in2size > 0)
  683. {
  684. if (cd2 != (iconv_t)(-1))
  685. res2 = iconv_carefully (cd2,
  686. &in2ptr, &in2size,
  687. &out2ptr, &out2size,
  688. &incremented2);
  689. else
  690. /* TO_CODESET is UTF-8. */
  691. res2 = utf8conv_carefully (false,
  692. &in2ptr, &in2size,
  693. &out2ptr, &out2size,
  694. &incremented2);
  695. }
  696. else /* in1size == 0 && !do_final_flush1
  697. && in2size == 0 && do_final_flush2 */
  698. {
  699. /* Now get the conversion state of CD1 back to the initial
  700. state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  701. # if defined _LIBICONV_VERSION \
  702. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  703. || defined __sun)
  704. if (cd2 != (iconv_t)(-1))
  705. res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
  706. else
  707. # endif
  708. res2 = 0;
  709. do_final_flush2 = false;
  710. incremented2 = true;
  711. }
  712. length = out2ptr - result;
  713. grow = (length + extra_alloc > allocated / 2);
  714. if (res2 == (size_t)(-1))
  715. {
  716. if (errno == E2BIG)
  717. grow = true;
  718. else if (errno == EINVAL)
  719. break;
  720. else if (errno == EILSEQ && handler != iconveh_error)
  721. {
  722. /* Error handling can produce up to 10 bytes of UTF-8
  723. output. But TO_CODESET may be UCS-2, UTF-16 or
  724. UCS-4, so use CD2 here as well. */
  725. char scratchbuf[10];
  726. size_t scratchlen;
  727. ucs4_t uc;
  728. const char *inptr;
  729. size_t insize;
  730. size_t res;
  731. if (incremented2)
  732. {
  733. if (u8_prev (&uc, (const uint8_t *) in2ptr,
  734. (const uint8_t *) utf8buf)
  735. == NULL)
  736. abort ();
  737. }
  738. else
  739. {
  740. int n;
  741. if (in2size == 0)
  742. abort ();
  743. n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
  744. in2size);
  745. in2ptr += n;
  746. in2size -= n;
  747. }
  748. if (handler == iconveh_escape_sequence)
  749. {
  750. static char hex[16] = "0123456789ABCDEF";
  751. scratchlen = 0;
  752. scratchbuf[scratchlen++] = '\\';
  753. if (uc < 0x10000)
  754. scratchbuf[scratchlen++] = 'u';
  755. else
  756. {
  757. scratchbuf[scratchlen++] = 'U';
  758. scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
  759. scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
  760. scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
  761. scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
  762. }
  763. scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
  764. scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
  765. scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
  766. scratchbuf[scratchlen++] = hex[uc & 15];
  767. }
  768. else if (handler == iconveh_replacement_character)
  769. {
  770. /* U+FFFD in UTF-8 encoding. */
  771. scratchbuf[0] = '\357';
  772. scratchbuf[1] = '\277';
  773. scratchbuf[2] = '\275';
  774. scratchlen = 3;
  775. }
  776. else
  777. {
  778. scratchbuf[0] = '?';
  779. scratchlen = 1;
  780. }
  781. inptr = scratchbuf;
  782. insize = scratchlen;
  783. if (cd2 != (iconv_t)(-1))
  784. {
  785. char *out2ptr_try = out2ptr;
  786. size_t out2size_try = out2size;
  787. res = iconv (cd2,
  788. (ICONV_CONST char **) &inptr, &insize,
  789. &out2ptr_try, &out2size_try);
  790. if (handler == iconveh_replacement_character
  791. && (res == (size_t)(-1)
  792. ? errno == EILSEQ
  793. /* FreeBSD iconv(), NetBSD iconv(), and
  794. Solaris 11 iconv() insert a '?' if they
  795. cannot convert. This is what we want.
  796. But IRIX iconv() inserts a NUL byte if it
  797. cannot convert.
  798. And musl libc iconv() inserts a '*' if it
  799. cannot convert. */
  800. : (res > 0
  801. && !(out2ptr_try - out2ptr == 1
  802. && *out2ptr == '?'))))
  803. {
  804. /* The iconv() call failed.
  805. U+FFFD can't be converted to TO_CODESET.
  806. Use '?' instead. */
  807. scratchbuf[0] = '?';
  808. scratchlen = 1;
  809. inptr = scratchbuf;
  810. insize = scratchlen;
  811. res = iconv (cd2,
  812. (ICONV_CONST char **) &inptr, &insize,
  813. &out2ptr, &out2size);
  814. }
  815. else
  816. {
  817. /* Accept the results of the iconv() call. */
  818. out2ptr = out2ptr_try;
  819. out2size = out2size_try;
  820. res = 0;
  821. }
  822. }
  823. else
  824. {
  825. /* TO_CODESET is UTF-8. */
  826. if (out2size >= insize)
  827. {
  828. memcpy (out2ptr, inptr, insize);
  829. out2ptr += insize;
  830. out2size -= insize;
  831. inptr += insize;
  832. insize = 0;
  833. res = 0;
  834. }
  835. else
  836. {
  837. errno = E2BIG;
  838. res = (size_t)(-1);
  839. }
  840. }
  841. length = out2ptr - result;
  842. if (res == (size_t)(-1) && errno == E2BIG)
  843. {
  844. char *memory;
  845. allocated = 2 * allocated;
  846. if (length + 1 + extra_alloc > allocated)
  847. abort ();
  848. if (result == initial_result)
  849. memory = (char *) malloc (allocated);
  850. else
  851. memory = (char *) realloc (result, allocated);
  852. if (memory == NULL)
  853. {
  854. if (result != initial_result)
  855. free (result);
  856. errno = ENOMEM;
  857. return -1;
  858. }
  859. if (result == initial_result)
  860. memcpy (memory, initial_result, length);
  861. result = memory;
  862. grow = false;
  863. out2ptr = result + length;
  864. out2size = allocated - extra_alloc - length;
  865. if (cd2 != (iconv_t)(-1))
  866. res = iconv (cd2,
  867. (ICONV_CONST char **) &inptr,
  868. &insize,
  869. &out2ptr, &out2size);
  870. else
  871. {
  872. /* TO_CODESET is UTF-8. */
  873. if (!(out2size >= insize))
  874. abort ();
  875. memcpy (out2ptr, inptr, insize);
  876. out2ptr += insize;
  877. out2size -= insize;
  878. inptr += insize;
  879. insize = 0;
  880. res = 0;
  881. }
  882. length = out2ptr - result;
  883. }
  884. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  885. /* IRIX iconv() inserts a NUL byte if it cannot convert.
  886. FreeBSD iconv(), NetBSD iconv(), and Solaris 11
  887. iconv() insert a '?' if they cannot convert.
  888. musl libc iconv() inserts a '*' if it cannot convert.
  889. Only GNU libiconv and GNU libc are known to prefer
  890. to fail rather than doing a lossy conversion. */
  891. if (res != (size_t)(-1) && res > 0)
  892. {
  893. errno = EILSEQ;
  894. res = (size_t)(-1);
  895. }
  896. # endif
  897. if (res == (size_t)(-1))
  898. {
  899. /* Failure converting the ASCII replacement. */
  900. if (result != initial_result)
  901. free (result);
  902. return -1;
  903. }
  904. }
  905. else
  906. {
  907. if (result != initial_result)
  908. free (result);
  909. return -1;
  910. }
  911. }
  912. if (!(in2size > 0
  913. || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
  914. break;
  915. if (grow)
  916. {
  917. char *memory;
  918. allocated = 2 * allocated;
  919. if (result == initial_result)
  920. memory = (char *) malloc (allocated);
  921. else
  922. memory = (char *) realloc (result, allocated);
  923. if (memory == NULL)
  924. {
  925. if (result != initial_result)
  926. free (result);
  927. errno = ENOMEM;
  928. return -1;
  929. }
  930. if (result == initial_result)
  931. memcpy (memory, initial_result, length);
  932. result = memory;
  933. }
  934. }
  935. /* Move the remaining bytes to the beginning of utf8buf. */
  936. if (in2size > 0)
  937. memmove (utf8buf, in2ptr, in2size);
  938. utf8len = in2size;
  939. }
  940. if (res1 == (size_t)(-1))
  941. {
  942. if (errno1 == EINVAL)
  943. in1size = 0;
  944. else if (errno1 == EILSEQ)
  945. {
  946. if (result != initial_result)
  947. free (result);
  948. errno = errno1;
  949. return -1;
  950. }
  951. }
  952. }
  953. # undef utf8bufsize
  954. }
  955. done:
  956. /* Now the final memory allocation. */
  957. if (result == tmpbuf)
  958. {
  959. size_t memsize = length + extra_alloc;
  960. if (*resultp != NULL && *lengthp >= memsize)
  961. result = *resultp;
  962. else
  963. {
  964. char *memory;
  965. memory = (char *) malloc (memsize > 0 ? memsize : 1);
  966. if (memory != NULL)
  967. result = memory;
  968. else
  969. {
  970. errno = ENOMEM;
  971. return -1;
  972. }
  973. }
  974. memcpy (result, tmpbuf, length);
  975. }
  976. else if (result != *resultp && length + extra_alloc < allocated)
  977. {
  978. /* Shrink the allocated memory if possible. */
  979. size_t memsize = length + extra_alloc;
  980. char *memory;
  981. memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
  982. if (memory != NULL)
  983. result = memory;
  984. }
  985. *resultp = result;
  986. *lengthp = length;
  987. return 0;
  988. # undef tmpbuf
  989. # undef tmpbufsize
  990. }
  991. int
  992. mem_cd_iconveh (const char *src, size_t srclen,
  993. const iconveh_t *cd,
  994. enum iconv_ilseq_handler handler,
  995. size_t *offsets,
  996. char **resultp, size_t *lengthp)
  997. {
  998. return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
  999. handler, 0, offsets, resultp, lengthp);
  1000. }
  1001. char *
  1002. str_cd_iconveh (const char *src,
  1003. const iconveh_t *cd,
  1004. enum iconv_ilseq_handler handler)
  1005. {
  1006. /* For most encodings, a trailing NUL byte in the input will be converted
  1007. to a trailing NUL byte in the output. But not for UTF-7. So that this
  1008. function is usable for UTF-7, we have to exclude the NUL byte from the
  1009. conversion and add it by hand afterwards. */
  1010. char *result = NULL;
  1011. size_t length = 0;
  1012. int retval = mem_cd_iconveh_internal (src, strlen (src),
  1013. cd->cd, cd->cd1, cd->cd2, handler, 1,
  1014. NULL, &result, &length);
  1015. if (retval < 0)
  1016. {
  1017. free (result);
  1018. return NULL;
  1019. }
  1020. /* Add the terminating NUL byte. */
  1021. result[length] = '\0';
  1022. return result;
  1023. }
  1024. #endif
  1025. int
  1026. mem_iconveh (const char *src, size_t srclen,
  1027. const char *from_codeset, const char *to_codeset,
  1028. enum iconv_ilseq_handler handler,
  1029. size_t *offsets,
  1030. char **resultp, size_t *lengthp)
  1031. {
  1032. if (srclen == 0)
  1033. {
  1034. /* Nothing to convert. */
  1035. *lengthp = 0;
  1036. return 0;
  1037. }
  1038. else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
  1039. {
  1040. char *result;
  1041. if (*resultp != NULL && *lengthp >= srclen)
  1042. result = *resultp;
  1043. else
  1044. {
  1045. result = (char *) malloc (srclen);
  1046. if (result == NULL)
  1047. {
  1048. errno = ENOMEM;
  1049. return -1;
  1050. }
  1051. }
  1052. memcpy (result, src, srclen);
  1053. *resultp = result;
  1054. *lengthp = srclen;
  1055. return 0;
  1056. }
  1057. else
  1058. {
  1059. #if HAVE_ICONV
  1060. iconveh_t cd;
  1061. char *result;
  1062. size_t length;
  1063. int retval;
  1064. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1065. return -1;
  1066. result = *resultp;
  1067. length = *lengthp;
  1068. retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
  1069. &result, &length);
  1070. if (retval < 0)
  1071. {
  1072. /* Close cd, but preserve the errno from str_cd_iconv. */
  1073. int saved_errno = errno;
  1074. iconveh_close (&cd);
  1075. errno = saved_errno;
  1076. }
  1077. else
  1078. {
  1079. if (iconveh_close (&cd) < 0)
  1080. {
  1081. if (result != *resultp)
  1082. free (result);
  1083. return -1;
  1084. }
  1085. *resultp = result;
  1086. *lengthp = length;
  1087. }
  1088. return retval;
  1089. #else
  1090. /* This is a different error code than if iconv_open existed but didn't
  1091. support from_codeset and to_codeset, so that the caller can emit
  1092. an error message such as
  1093. "iconv() is not supported. Installing GNU libiconv and
  1094. then reinstalling this package would fix this." */
  1095. errno = ENOSYS;
  1096. return -1;
  1097. #endif
  1098. }
  1099. }
  1100. char *
  1101. str_iconveh (const char *src,
  1102. const char *from_codeset, const char *to_codeset,
  1103. enum iconv_ilseq_handler handler)
  1104. {
  1105. if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
  1106. {
  1107. char *result = strdup (src);
  1108. if (result == NULL)
  1109. errno = ENOMEM;
  1110. return result;
  1111. }
  1112. else
  1113. {
  1114. #if HAVE_ICONV
  1115. iconveh_t cd;
  1116. char *result;
  1117. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1118. return NULL;
  1119. result = str_cd_iconveh (src, &cd, handler);
  1120. if (result == NULL)
  1121. {
  1122. /* Close cd, but preserve the errno from str_cd_iconv. */
  1123. int saved_errno = errno;
  1124. iconveh_close (&cd);
  1125. errno = saved_errno;
  1126. }
  1127. else
  1128. {
  1129. if (iconveh_close (&cd) < 0)
  1130. {
  1131. free (result);
  1132. return NULL;
  1133. }
  1134. }
  1135. return result;
  1136. #else
  1137. /* This is a different error code than if iconv_open existed but didn't
  1138. support from_codeset and to_codeset, so that the caller can emit
  1139. an error message such as
  1140. "iconv() is not supported. Installing GNU libiconv and
  1141. then reinstalling this package would fix this." */
  1142. errno = ENOSYS;
  1143. return NULL;
  1144. #endif
  1145. }
  1146. }