utf8.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. /* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */
  2. /*
  3. * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
  4. *
  5. * Permission to use, copy, modify, and distribute this software for any
  6. * purpose with or without fee is hereby granted, provided that the above
  7. * copyright notice and this permission notice appear in all copies.
  8. *
  9. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. */
  17. /*
  18. * Utility functions for multibyte-character handling,
  19. * in particular to sanitize untrusted strings for terminal output.
  20. */
  21. #include "includes.h"
  22. #include <sys/types.h>
  23. #ifdef HAVE_LANGINFO_H
  24. # include <langinfo.h>
  25. #endif
  26. #include <limits.h>
  27. #include <locale.h>
  28. #include <stdarg.h>
  29. #include <stdio.h>
  30. #include <stdlib.h>
  31. #include <string.h>
  32. #if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
  33. # include <vis.h>
  34. #endif
  35. #ifdef HAVE_WCHAR_H
  36. # include <wchar.h>
  37. #endif
  38. #include "utf8.h"
  39. static int dangerous_locale(void);
  40. static int grow_dst(char **, size_t *, size_t, char **, size_t);
  /*
   * Decide whether the current character encoding is one we cannot
   * safely recover in.
   *
   * For US-ASCII and UTF-8 encodings, we can safely recover from
   * encoding errors and from non-printable characters.  For any
   * other encoding, err on the side of caution and abort parsing:
   * for state-dependent encodings, recovery is impossible, and for
   * arbitrary encodings, replacing non-printable characters would be
   * non-trivial and too fragile.
   *
   * The table below lists the codeset names accepted as safe; the
   * per-entry comments say which operating systems report that name
   * from nl_langinfo(CODESET) for US-ASCII.
   *
   * Returns 0 if the codeset is in the table, 1 otherwise.
   */
  static int
  dangerous_locale(void)
  {
      static const char *const safe_codesets[] = {
          "UTF-8",
          "US-ASCII",         /* OpenBSD */
          "ANSI_X3.4-1968",   /* Linux */
          "ISO8859-1",        /* AIX */
          "646",              /* Solaris, NetBSD */
          "",                 /* Solaris 6 */
      };
      const char *codeset;
      size_t i;

      codeset = nl_langinfo(CODESET);
      for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++) {
          if (strcmp(codeset, safe_codesets[i]) == 0)
              return 0;
      }
      return 1;
  }
  /*
   * Make sure the output buffer has room for another write.
   *
   * dst   - in/out: start of the output buffer; updated if it moves.
   * sz    - in/out: number of bytes currently allocated for *dst.
   * maxsz - upper bound for the buffer size; growth is clamped to it.
   * dp    - in/out: write cursor inside *dst; re-based if the buffer moves.
   * need  - number of bytes the caller is about to write at *dp.
   *
   * Nothing is done when strictly more than need bytes remain after *dp
   * (the extra byte leaves room for a terminator).  Otherwise the buffer
   * is enlarged by 128 bytes, but never beyond maxsz; it is the caller's
   * job to check against maxsz before asking for room.
   *
   * Returns 0 on success and -1 if the reallocation fails, in which case
   * *dst, *sz and *dp are left untouched.
   */
  static int
  grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
  {
      char *tp;
      size_t tsz;
      size_t used;

      /*
       * Compare sizes rather than pointers: forming *dp + need may point
       * beyond one past the end of the buffer, which is undefined.
       */
      used = (size_t)(*dp - *dst);
      if (used < *sz && need < *sz - used)
          return 0;

      tsz = *sz + 128;
      if (tsz > maxsz)
          tsz = maxsz;
      if ((tp = recallocarray(*dst, *sz, tsz, 1)) == NULL)
          return -1;

      /*
       * Re-base the cursor with the offset saved above; the old *dst
       * value must not be used once the buffer has been reallocated.
       */
      *dp = tp + used;
      *dst = tp;
      *sz = tsz;
      return 0;
  }
  79. /*
  80. * The following two functions limit the number of bytes written,
  81. * including the terminating '\0', to sz. Unless wp is NULL,
  82. * they limit the number of display columns occupied to *wp.
  83. * Whichever is reached first terminates the output string.
  84. * To stay close to the standard interfaces, they return the number of
  85. * non-NUL bytes that would have been written if both were unlimited.
  86. * If wp is NULL, newline, carriage return, and tab are allowed;
  87. * otherwise, the actual number of columns occupied by what was
  88. * written is returned in *wp.
  89. */
  90. int
  91. vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
  92. {
  93. char *src; /* Source string returned from vasprintf. */
  94. char *sp; /* Pointer into src. */
  95. char *dst; /* Destination string to be returned. */
  96. char *dp; /* Pointer into dst. */
  97. char *tp; /* Temporary pointer for dst. */
  98. size_t sz; /* Number of bytes allocated for dst. */
  99. wchar_t wc; /* Wide character at sp. */
  100. int len; /* Number of bytes in the character at sp. */
  101. int ret; /* Number of bytes needed to format src. */
  102. int width; /* Display width of the character wc. */
  103. int total_width, max_width, print;
  104. src = NULL;
  105. if ((ret = vasprintf(&src, fmt, ap)) <= 0)
  106. goto fail;
  107. sz = strlen(src) + 1;
  108. if ((dst = malloc(sz)) == NULL) {
  109. free(src);
  110. ret = -1;
  111. goto fail;
  112. }
  113. if (maxsz > INT_MAX)
  114. maxsz = INT_MAX;
  115. sp = src;
  116. dp = dst;
  117. ret = 0;
  118. print = 1;
  119. total_width = 0;
  120. max_width = wp == NULL ? INT_MAX : *wp;
  121. while (*sp != '\0') {
  122. if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
  123. (void)mbtowc(NULL, NULL, MB_CUR_MAX);
  124. if (dangerous_locale()) {
  125. ret = -1;
  126. break;
  127. }
  128. len = 1;
  129. width = -1;
  130. } else if (wp == NULL &&
  131. (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
  132. /*
  133. * Don't use width uninitialized; the actual
  134. * value doesn't matter because total_width
  135. * is only returned for wp != NULL.
  136. */
  137. width = 0;
  138. } else if ((width = wcwidth(wc)) == -1 &&
  139. dangerous_locale()) {
  140. ret = -1;
  141. break;
  142. }
  143. /* Valid, printable character. */
  144. if (width >= 0) {
  145. if (print && (dp - dst >= (int)maxsz - len ||
  146. total_width > max_width - width))
  147. print = 0;
  148. if (print) {
  149. if (grow_dst(&dst, &sz, maxsz,
  150. &dp, len) == -1) {
  151. ret = -1;
  152. break;
  153. }
  154. total_width += width;
  155. memcpy(dp, sp, len);
  156. dp += len;
  157. }
  158. sp += len;
  159. if (ret >= 0)
  160. ret += len;
  161. continue;
  162. }
  163. /* Escaping required. */
  164. while (len > 0) {
  165. if (print && (dp - dst >= (int)maxsz - 4 ||
  166. total_width > max_width - 4))
  167. print = 0;
  168. if (print) {
  169. if (grow_dst(&dst, &sz, maxsz,
  170. &dp, 4) == -1) {
  171. ret = -1;
  172. break;
  173. }
  174. tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
  175. width = tp - dp;
  176. total_width += width;
  177. dp = tp;
  178. } else
  179. width = 4;
  180. len--;
  181. sp++;
  182. if (ret >= 0)
  183. ret += width;
  184. }
  185. if (len > 0)
  186. break;
  187. }
  188. free(src);
  189. *dp = '\0';
  190. *str = dst;
  191. if (wp != NULL)
  192. *wp = total_width;
  193. /*
  194. * If the string was truncated by the width limit but
  195. * would have fit into the size limit, the only sane way
  196. * to report the problem is using the return value, such
  197. * that the usual idiom "if (ret < 0 || ret >= sz) error"
  198. * works as expected.
  199. */
  200. if (ret < (int)maxsz && !print)
  201. ret = -1;
  202. return ret;
  203. fail:
  204. if (wp != NULL)
  205. *wp = 0;
  206. if (ret == 0) {
  207. *str = src;
  208. return 0;
  209. } else {
  210. *str = NULL;
  211. return -1;
  212. }
  213. }
  /*
   * Format into the caller-supplied buffer str of sz bytes.
   *
   * The arguments are formatted and sanitized by vasnmprintf() with sz as
   * the byte limit and wp as the optional column limit; the result is
   * then copied into str with strlcpy() and the temporary buffer freed.
   * If vasnmprintf() returned no buffer, str is set to the empty string.
   *
   * Nothing is stored through str on that failure path when sz is 0,
   * because a zero-sized buffer has no room for even the terminator.
   *
   * Returns whatever vasnmprintf() returned.
   */
  int
  snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
  {
      va_list ap;
      char *cp = NULL;
      int ret;

      va_start(ap, fmt);
      ret = vasnmprintf(&cp, sz, wp, fmt, ap);
      va_end(ap);
      if (cp != NULL) {
          (void)strlcpy(str, cp, sz);
          free(cp);
      } else if (sz > 0) {
          /* Only terminate when the buffer can hold the '\0'. */
          *str = '\0';
      }
      return ret;
  }
  /*
   * Variadic front end for vasnmprintf(): format into a newly allocated
   * string that is returned in *outp.
   *
   * *outp is cleared before formatting starts.  sz is the byte limit and
   * wp the optional column limit, both handed to vasnmprintf() unchanged.
   *
   * Returns whatever vasnmprintf() returned.
   */
  int
  asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...)
  {
      va_list args;
      int nbytes;

      *outp = NULL;

      va_start(args, fmt);
      nbytes = vasnmprintf(outp, sz, wp, fmt, args);
      va_end(args);

      return nbytes;
  }
  /*
   * To stay close to the standard interfaces, the following functions
   * return the number of non-NUL bytes written.
   */

  /*
   * Format fmt/ap through vasnmprintf() with a byte limit of INT_MAX and
   * no column limit, then write the sanitized string to stream.
   *
   * Returns the value reported by vasnmprintf(), or -1 if formatting
   * failed or fputs() reported EOF.
   */
  int
  vfmprintf(FILE *stream, const char *fmt, va_list ap)
  {
      char *text = NULL;
      int nbytes;

      nbytes = vasnmprintf(&text, INT_MAX, NULL, fmt, ap);
      if (nbytes < 0)
          nbytes = -1;
      else if (fputs(text, stream) == EOF)
          nbytes = -1;

      /* text may be set even when formatting failed; always release it. */
      free(text);
      return nbytes;
  }
  /*
   * Variadic front end for vfmprintf(): format the arguments and write
   * the sanitized result to stream.
   *
   * Returns whatever vfmprintf() returned.
   */
  int
  fmprintf(FILE *stream, const char *fmt, ...)
  {
      va_list args;
      int nbytes;

      va_start(args, fmt);
      nbytes = vfmprintf(stream, fmt, args);
      va_end(args);

      return nbytes;
  }
  /*
   * Variadic front end for vfmprintf() that always writes to stdout.
   *
   * Returns whatever vfmprintf() returned.
   */
  int
  mprintf(const char *fmt, ...)
  {
      va_list args;
      int nbytes;

      va_start(args, fmt);
      nbytes = vfmprintf(stdout, fmt, args);
      va_end(args);

      return nbytes;
  }
  /*
   * Set up libc for multibyte output in the user's chosen locale.
   *
   * XXX: we are known to have problems with Turkish (i/I confusion) so we
   * deliberately fall back to the C locale for now.  Longer term we should
   * always prefer to select C.[encoding] if possible, but there's no
   * standardisation in locales between systems, so we'll need to survey
   * what's out there first.
   *
   * LC_ALL, LC_CTYPE and LANG are examined in that order, and the first
   * one holding a non-empty value decides.  If that value starts with
   * "TR" (compared case-insensitively), LC_CTYPE is set to C.UTF-8 or
   * POSIX.UTF-8 when the value mentions UTF-8 and one of those locales
   * can be selected, and to "C" otherwise.  In every other case LC_CTYPE
   * is taken from the environment with setlocale(LC_CTYPE, "").
   */
  void
  msetlocale(void)
  {
      const char *vars[] = { "LC_ALL", "LC_CTYPE", "LANG", NULL };
      char *cp;
      int i;

      /*
       * We can't yet cope with dotless/dotted I in Turkish locales,
       * so fall back to the C locale for these.
       */
      for (i = 0; vars[i] != NULL; i++) {
          cp = getenv(vars[i]);
          /*
           * A variable set to the empty string selects nothing: the
           * locale is then taken from the next variable, so keep
           * looking instead of treating it as a non-Turkish choice.
           */
          if (cp == NULL || *cp == '\0')
              continue;
          if (strncasecmp(cp, "TR", 2) != 0)
              break;
          /*
           * If we're in a UTF-8 locale then prefer to use
           * the C.UTF-8 locale (or equivalent) if it exists.
           */
          if ((strcasestr(cp, "UTF-8") != NULL ||
              strcasestr(cp, "UTF8") != NULL) &&
              (setlocale(LC_CTYPE, "C.UTF-8") != NULL ||
              setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL))
              return;
          setlocale(LC_CTYPE, "C");
          return;
      }
      /* We can handle this locale */
      setlocale(LC_CTYPE, "");
  }