archive_string.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. /*-
  2. * Copyright (c) 2003-2007 Tim Kientzle
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. *
  14. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  15. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  18. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "archive_platform.h"
  26. __FBSDID("$FreeBSD: src/lib/libarchive/archive_string.c,v 1.17 2008/12/06 05:56:43 kientzle Exp $");
  27. /*
  28. * Basic resizable string support, to simplify manipulating arbitrary-sized
  29. * strings while minimizing heap activity.
  30. */
  31. #ifdef HAVE_STDLIB_H
  32. #include <stdlib.h>
  33. #endif
  34. #ifdef HAVE_STRING_H
  35. #include <string.h>
  36. #endif
  37. #ifdef HAVE_WCHAR_H
  38. #include <wchar.h>
  39. #endif
  40. #if defined(_WIN32) && !defined(__CYGWIN__)
  41. #include <windows.h>
  42. #endif
  43. #include "archive_private.h"
  44. #include "archive_string.h"
  45. struct archive_string *
  46. __archive_string_append(struct archive_string *as, const char *p, size_t s)
  47. {
  48. if (__archive_string_ensure(as, as->length + s + 1) == NULL)
  49. __archive_errx(1, "Out of memory");
  50. memcpy(as->s + as->length, p, s);
  51. as->s[as->length + s] = 0;
  52. as->length += s;
  53. return (as);
  54. }
  55. void
  56. __archive_string_copy(struct archive_string *dest, struct archive_string *src)
  57. {
  58. if (src->length == 0)
  59. dest->length = 0;
  60. else {
  61. if (__archive_string_ensure(dest, src->length + 1) == NULL)
  62. __archive_errx(1, "Out of memory");
  63. memcpy(dest->s, src->s, src->length);
  64. dest->length = src->length;
  65. dest->s[dest->length] = 0;
  66. }
  67. }
  68. void
  69. __archive_string_concat(struct archive_string *dest, struct archive_string *src)
  70. {
  71. if (src->length > 0) {
  72. if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL)
  73. __archive_errx(1, "Out of memory");
  74. memcpy(dest->s + dest->length, src->s, src->length);
  75. dest->length += src->length;
  76. dest->s[dest->length] = 0;
  77. }
  78. }
  79. void
  80. __archive_string_free(struct archive_string *as)
  81. {
  82. as->length = 0;
  83. as->buffer_length = 0;
  84. if (as->s != NULL) {
  85. free(as->s);
  86. as->s = NULL;
  87. }
  88. }
  89. /* Returns NULL on any allocation failure. */
  90. struct archive_string *
  91. __archive_string_ensure(struct archive_string *as, size_t s)
  92. {
  93. char *p;
  94. size_t new_length;
  95. /* If buffer is already big enough, don't reallocate. */
  96. if (as->s && (s <= as->buffer_length))
  97. return (as);
  98. /*
  99. * Growing the buffer at least exponentially ensures that
  100. * append operations are always linear in the number of
  101. * characters appended. Using a smaller growth rate for
  102. * larger buffers reduces memory waste somewhat at the cost of
  103. * a larger constant factor.
  104. */
  105. if (as->buffer_length < 32)
  106. /* Start with a minimum 32-character buffer. */
  107. new_length = 32;
  108. else if (as->buffer_length < 8192)
  109. /* Buffers under 8k are doubled for speed. */
  110. new_length = as->buffer_length + as->buffer_length;
  111. else {
  112. /* Buffers 8k and over grow by at least 25% each time. */
  113. new_length = as->buffer_length + as->buffer_length / 4;
  114. /* Be safe: If size wraps, fail. */
  115. if (new_length < as->buffer_length) {
  116. /* On failure, wipe the string and return NULL. */
  117. __archive_string_free(as);
  118. return (NULL);
  119. }
  120. }
  121. /*
  122. * The computation above is a lower limit to how much we'll
  123. * grow the buffer. In any case, we have to grow it enough to
  124. * hold the request.
  125. */
  126. if (new_length < s)
  127. new_length = s;
  128. /* Now we can reallocate the buffer. */
  129. p = (char *)realloc(as->s, new_length);
  130. if (p == NULL) {
  131. /* On failure, wipe the string and return NULL. */
  132. __archive_string_free(as);
  133. return (NULL);
  134. }
  135. as->s = p;
  136. as->buffer_length = new_length;
  137. return (as);
  138. }
  /*
   * Append the string at '_p' to 'as', taking at most 'n' bytes and
   * stopping early at the first NUL byte.
   *
   * Returns the result of __archive_string_append().
   */
  struct archive_string *
  __archive_strncat(struct archive_string *as, const void *_p, size_t n)
  {
      const char *p = (const char *)_p;
      size_t s = 0;

      /*
       * Bounded strlen(): check the limit BEFORE dereferencing, so only
       * p[0] .. p[n-1] are ever examined.  Testing the byte first would
       * read p[n], one position past an unterminated n-byte buffer.
       */
      while (s < n && p[s] != '\0')
          s++;
      return (__archive_string_append(as, p, s));
  }
  /*
   * Append the single character 'c' to 'as'.
   *
   * Returns the result of __archive_string_append().
   */
  struct archive_string *
  __archive_strappend_char(struct archive_string *as, char c)
  {
      const char *one = &c;
      struct archive_string *result;

      /* Treat the by-value argument as a one-byte buffer. */
      result = __archive_string_append(as, one, 1);
      return (result);
  }
  /*
   * Convert the wide character string 'w' to UTF-8 and append it to
   * 'as'.
   *
   * Returns 'as' when every character could be encoded.  Returns NULL if
   * any value was too large to encode; such values are written as '?',
   * so 'as' still receives a best-effort conversion.
   */
  struct archive_string *
  __archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w)
  {
      char chunk[256];
      char *out = chunk;
      /* Not wchar_t: a combined surrogate pair needs more than 16 bits. */
      unsigned cp;
      struct archive_string *status = as;

      /*
       * Encode one code point at a time into 'chunk' and hand the
       * chunk to the string whenever it is nearly full.
       */
      while (*w != L'\0') {
          /* Flush once 16 or fewer bytes remain free in the chunk. */
          if ((size_t)(out - chunk) >= (size_t)(sizeof(chunk) - 16)) {
              *out = '\0';
              archive_strcat(as, chunk);
              out = chunk;
          }

          cp = *w++;
          /* A leading surrogate followed by a trailing one forms a
           * single code point above 0xffff. */
          if (cp >= 0xD800 && cp <= 0xDBff
              && *w >= 0xDC00 && *w <= 0xDFFF) {
              cp = 0x10000 + (cp - 0xD800) * 0x400 + (*w - 0xDC00);
              ++w;
          }

          /* Emit the UTF-8 bytes, largest range first. */
          if (cp > 0x1fffff) {
              /* Too large for a four-byte sequence. */
              /* TODO: use \uXXXX escape here instead of ? */
              *out++ = '?';
              status = NULL;
          } else if (cp > 0xffff) {
              *out++ = (char)(0xf0 | ((cp >> 18) & 0x07));
              *out++ = (char)(0x80 | ((cp >> 12) & 0x3f));
              *out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else if (cp > 0x7ff) {
              *out++ = (char)(0xe0 | ((cp >> 12) & 0x0f));
              *out++ = (char)(0x80 | ((cp >> 6) & 0x3f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else if (cp > 0x7f) {
              *out++ = (char)(0xc0 | ((cp >> 6) & 0x1f));
              *out++ = (char)(0x80 | (cp & 0x3f));
          } else {
              *out++ = (char)cp;
          }
      }

      /* Hand over whatever is left in the chunk. */
      *out = '\0';
      archive_strcat(as, chunk);
      return (status);
  }
  /*
   * Decode the UTF-8 sequence that starts at 's'.
   *
   * pwc  receives the decoded value on success; untouched otherwise.
   * s    first byte of the sequence.
   * n    number of bytes available at 's'; only consulted for
   *      multi-byte sequences.
   *
   * Returns the number of bytes consumed (1-4), 0 if the first byte is
   * NUL (end of string), or -1 if the first byte is not a valid lead
   * byte, fewer than the required bytes are available, or a
   * continuation byte is malformed.
   */
  static int
  utf8_to_unicode(int *pwc, const char *s, size_t n)
  {
      const unsigned char *bytes = (const unsigned char *)s;
      int lead = bytes[0];
      int seq_len, value, i;

      if (lead == 0)
          return (0); /* Standard: return 0 for end-of-string. */

      /* Plain ASCII is a sequence of its own. */
      if ((lead & 0x80) == 0) {
          *pwc = lead & 0x7f;
          return (1);
      }

      /* The lead byte fixes the sequence length and its payload bits. */
      if ((lead & 0xe0) == 0xc0) {
          seq_len = 2;
          value = lead & 0x1f;
      } else if ((lead & 0xf0) == 0xe0) {
          seq_len = 3;
          value = lead & 0x0f;
      } else if ((lead & 0xf8) == 0xf0) {
          seq_len = 4;
          value = lead & 0x07;
      } else {
          /* Invalid first byte. */
          return (-1);
      }

      /* Do not look at continuation bytes that are not there. */
      if (n < (size_t)seq_len)
          return (-1);

      /* Each continuation byte is 10xxxxxx and contributes six bits. */
      for (i = 1; i < seq_len; i++) {
          if ((bytes[i] & 0xc0) != 0x80)
              return (-1);
          value = (value << 6) | (bytes[i] & 0x3f);
      }

      *pwc = value;
      return (seq_len);
  }
  269. /*
  270. * Return a wide-character Unicode string by converting this archive_string
  271. * from UTF-8. We assume that systems with 16-bit wchar_t always use
  272. * UTF16 and systems with 32-bit wchar_t can accept UCS4.
  273. */
  274. wchar_t *
  275. __archive_string_utf8_w(struct archive_string *as)
  276. {
  277. wchar_t *ws, *dest;
  278. int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */
  279. const char *src, *end;
  280. int n;
  281. ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t));
  282. if (ws == NULL)
  283. __archive_errx(1, "Out of memory");
  284. dest = ws;
  285. src = as->s;
  286. end = as->s + as->buffer_length;
  287. while (*src != '\0') {
  288. n = utf8_to_unicode(&wc, src, end - src);
  289. if (n == 0)
  290. break;
  291. if (n < 0) {
  292. free(ws);
  293. return (NULL);
  294. }
  295. src += n;
  296. if (wc >= 0xD800 && wc <= 0xDBFF) {
  297. /* This is a leading surrogate; some idiot
  298. * has translated UTF16 to UTF8 without combining
  299. * surrogates; rebuild the full code point before
  300. * continuing. */
  301. n = utf8_to_unicode(&wc2, src, end - src);
  302. if (n < 0) {
  303. free(ws);
  304. return (NULL);
  305. }
  306. if (n == 0) /* Ignore the leading surrogate */
  307. break;
  308. if (wc2 < 0xDC00 || wc2 > 0xDFFF) {
  309. /* If the second character isn't a
  310. * trailing surrogate, then someone
  311. * has really screwed up and this is
  312. * invalid. */
  313. free(ws);
  314. return (NULL);
  315. } else {
  316. src += n;
  317. wc -= 0xD800;
  318. wc *= 0x400;
  319. wc += wc2 - 0xDC00;
  320. wc += 0x10000;
  321. }
  322. }
  323. if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) {
  324. /* We have a code point that won't fit into a
  325. * wchar_t; convert it to a surrogate pair. */
  326. wc -= 0x10000;
  327. *dest++ = ((wc >> 10) & 0x3ff) + 0xD800;
  328. *dest++ = (wc & 0x3ff) + 0xDC00;
  329. } else
  330. *dest++ = wc;
  331. }
  332. *dest++ = L'\0';
  333. return (ws);
  334. }
  335. #if defined(_WIN32) && !defined(__CYGWIN__)
  336. /*
  337. * Translates a wide character string into current locale character set
  338. * and appends to the archive_string. Note: returns NULL if conversion
  339. * fails.
  340. *
  341. * Win32 builds use WideCharToMultiByte from the Windows API.
  342. * (Maybe Cygwin should too? WideCharToMultiByte will know a
  343. * lot more about local character encodings than the wcrtomb()
  344. * wrapper is going to know.)
  345. */
  346. struct archive_string *
  347. __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
  348. {
  349. char *p;
  350. int l, wl;
  351. BOOL useDefaultChar = FALSE;
  352. wl = (int)wcslen(w);
  353. l = wl * 4 + 4;
  354. p = malloc(l);
  355. if (p == NULL)
  356. __archive_errx(1, "Out of memory");
  357. /* To check a useDefaultChar is to simulate error handling of
  358. * the my_wcstombs() which is running on non Windows system with
  359. * wctomb().
  360. * And to set NULL for last argument is necessary when a codepage
  361. * is not CP_ACP(current locale).
  362. */
  363. l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar);
  364. if (l == 0) {
  365. free(p);
  366. return (NULL);
  367. }
  368. __archive_string_append(as, p, l);
  369. free(p);
  370. return (as);
  371. }
  372. #else
  /*
   * Translates a wide character string into the current locale
   * character set and appends it to the archive_string.
   *
   * Returns 'as' on success and NULL as soon as one character cannot be
   * converted.
   *
   * Non-Windows builds convert one character at a time with ISO C
   * wcrtomb() or wctomb().  A platform that has neither falls back to
   * the built-in UTF-8 conversion.
   */
  struct archive_string *
  __archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
  {
  #if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB)
      /* If there's no built-in locale support, fall back to UTF8 always. */
      return __archive_strappend_w_utf8(as, w);
  #else
      /*
       * The standard wcstombs() cannot report how large the output
       * buffer has to be, so characters are converted one by one
       * into a small chunk that is appended whenever it fills up.
       * wcrtomb() is preferred when available because it is
       * thread-safe.
       */
      char chunk[256];
      char *out = chunk;
      int produced;
  #if HAVE_WCRTOMB
      mbstate_t state;

      memset(&state, 0, sizeof(state));
  #else
      /* Clear the shift state before starting. */
      wctomb(NULL, L'\0');
  #endif

      for (; *w != L'\0'; w++) {
          /* Flush once MB_CUR_MAX or fewer bytes remain free, so the
           * next character always fits. */
          if ((size_t)(out - chunk) >= (size_t)(sizeof(chunk) - MB_CUR_MAX)) {
              *out = '\0';
              archive_strcat(as, chunk);
              out = chunk;
          }
  #if HAVE_WCRTOMB
          produced = wcrtomb(out, *w, &state);
  #else
          produced = wctomb(out, *w);
  #endif
          /* Unconvertible character: give up. */
          if (produced == -1)
              return (NULL);
          out += produced;
      }

      /* Hand over whatever is left in the chunk. */
      *out = '\0';
      archive_strcat(as, chunk);
      return (as);
  #endif
  }
  432. #endif /* _WIN32 && ! __CYGWIN__ */