123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356 |
- /* $OpenBSD: utf8.c,v 1.11 2020/05/01 06:28:52 djm Exp $ */
- /*
- * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
- /*
- * Utility functions for multibyte-character handling,
- * in particular to sanitize untrusted strings for terminal output.
- */
- #include "includes.h"
- #include <sys/types.h>
- #ifdef HAVE_LANGINFO_H
- # include <langinfo.h>
- #endif
- #include <limits.h>
- #include <locale.h>
- #include <stdarg.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
- # include <vis.h>
- #endif
- #ifdef HAVE_WCHAR_H
- # include <wchar.h>
- #endif
- #include "utf8.h"
- static int dangerous_locale(void);
- static int grow_dst(char **, size_t *, size_t, char **, size_t);
/*
 * Decide whether the current locale's character encoding is one we
 * cannot safely sanitize.
 *
 * Encoding errors and non-printable characters can be recovered from
 * in US-ASCII and UTF-8.  Every other encoding is treated as dangerous
 * and makes the caller abort parsing: state-dependent encodings cannot
 * be resynchronized at all, and replacing non-printable characters in
 * an arbitrary encoding would be non-trivial and too fragile.
 *
 * Returns 0 when nl_langinfo(CODESET) names one of the encodings listed
 * below, 1 otherwise.  The per-entry comments record which operating
 * system reports that name for US-ASCII.
 */
static int
dangerous_locale(void)
{
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* Linux */
		"ISO8859-1",		/* AIX */
		"646",			/* Solaris, NetBSD */
		""			/* Solaris 6 */
	};
	const char	*codeset;
	size_t		 i;

	codeset = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++) {
		if (strcmp(codeset, safe_codesets[i]) == 0)
			return 0;
	}
	return 1;
}
/*
 * Make sure the output buffer has room for `need` more bytes at *dp,
 * plus one byte to spare.
 *
 *   dst    in/out: start of the buffer; replaced if it is reallocated
 *   sz     in/out: current allocation size in bytes
 *   maxsz  upper bound for the allocation size
 *   dp     in/out: write position inside the buffer; rebased onto the
 *          new buffer if it is reallocated
 *   need   number of bytes the caller is about to write
 *
 * If the space is already available nothing happens.  Otherwise the
 * buffer is enlarged by 128 bytes (clamped to maxsz) with
 * recallocarray().
 *
 * Returns 0 on success and -1 if the reallocation fails, in which case
 * *dst, *sz and *dp are left untouched.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*newbuf;
	size_t	 newsz;
	size_t	 offset;

	if (*dp + need < *dst + *sz)
		return 0;

	newsz = *sz + 128;
	if (newsz > maxsz)
		newsz = maxsz;

	/*
	 * Record the write offset before reallocating: once the old
	 * buffer has been released its address must not be used for
	 * pointer arithmetic any more.
	 */
	offset = (size_t)(*dp - *dst);

	newbuf = recallocarray(*dst, *sz, newsz, 1);
	if (newbuf == NULL)
		return -1;

	*dst = newbuf;
	*dp = newbuf + offset;
	*sz = newsz;
	return 0;
}
- /*
- * The following two functions limit the number of bytes written,
- * including the terminating '\0', to sz. Unless wp is NULL,
- * they limit the number of display columns occupied to *wp.
- * Whichever is reached first terminates the output string.
- * To stay close to the standard interfaces, they return the number of
- * non-NUL bytes that would have been written if both were unlimited.
- * If wp is NULL, newline, carriage return, and tab are allowed;
- * otherwise, the actual number of columns occupied by what was
- * written is returned in *wp.
- */
- int
- vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
- {
- char *src; /* Source string returned from vasprintf. */
- char *sp; /* Pointer into src. */
- char *dst; /* Destination string to be returned. */
- char *dp; /* Pointer into dst. */
- char *tp; /* Temporary pointer for dst. */
- size_t sz; /* Number of bytes allocated for dst. */
- wchar_t wc; /* Wide character at sp. */
- int len; /* Number of bytes in the character at sp. */
- int ret; /* Number of bytes needed to format src. */
- int width; /* Display width of the character wc. */
- int total_width, max_width, print;
- src = NULL;
- if ((ret = vasprintf(&src, fmt, ap)) <= 0)
- goto fail;
- sz = strlen(src) + 1;
- if ((dst = malloc(sz)) == NULL) {
- free(src);
- ret = -1;
- goto fail;
- }
- if (maxsz > INT_MAX)
- maxsz = INT_MAX;
- sp = src;
- dp = dst;
- ret = 0;
- print = 1;
- total_width = 0;
- max_width = wp == NULL ? INT_MAX : *wp;
- while (*sp != '\0') {
- if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
- (void)mbtowc(NULL, NULL, MB_CUR_MAX);
- if (dangerous_locale()) {
- ret = -1;
- break;
- }
- len = 1;
- width = -1;
- } else if (wp == NULL &&
- (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
- /*
- * Don't use width uninitialized; the actual
- * value doesn't matter because total_width
- * is only returned for wp != NULL.
- */
- width = 0;
- } else if ((width = wcwidth(wc)) == -1 &&
- dangerous_locale()) {
- ret = -1;
- break;
- }
- /* Valid, printable character. */
- if (width >= 0) {
- if (print && (dp - dst >= (int)maxsz - len ||
- total_width > max_width - width))
- print = 0;
- if (print) {
- if (grow_dst(&dst, &sz, maxsz,
- &dp, len) == -1) {
- ret = -1;
- break;
- }
- total_width += width;
- memcpy(dp, sp, len);
- dp += len;
- }
- sp += len;
- if (ret >= 0)
- ret += len;
- continue;
- }
- /* Escaping required. */
- while (len > 0) {
- if (print && (dp - dst >= (int)maxsz - 4 ||
- total_width > max_width - 4))
- print = 0;
- if (print) {
- if (grow_dst(&dst, &sz, maxsz,
- &dp, 4) == -1) {
- ret = -1;
- break;
- }
- tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
- width = tp - dp;
- total_width += width;
- dp = tp;
- } else
- width = 4;
- len--;
- sp++;
- if (ret >= 0)
- ret += width;
- }
- if (len > 0)
- break;
- }
- free(src);
- *dp = '\0';
- *str = dst;
- if (wp != NULL)
- *wp = total_width;
- /*
- * If the string was truncated by the width limit but
- * would have fit into the size limit, the only sane way
- * to report the problem is using the return value, such
- * that the usual idiom "if (ret < 0 || ret >= sz) error"
- * works as expected.
- */
- if (ret < (int)maxsz && !print)
- ret = -1;
- return ret;
- fail:
- if (wp != NULL)
- *wp = 0;
- if (ret == 0) {
- *str = src;
- return 0;
- } else {
- *str = NULL;
- return -1;
- }
- }
/*
 * Format a sanitized string into the caller-supplied buffer str of
 * sz bytes.
 *
 *   str  destination buffer; receives at most sz bytes including '\0'
 *   sz   size of str in bytes; with sz == 0 nothing is written to str
 *   wp   NULL, or in: column limit / out: columns actually occupied
 *   fmt  printf-style format, followed by its arguments
 *
 * Returns whatever vasnmprintf() returns for the same arguments.  When
 * vasnmprintf() produces no string at all, str is set to the empty
 * string, provided it has room for the terminator.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);

	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never store into a zero-sized destination. */
		*str = '\0';
	}
	return ret;
}
/*
 * Variadic front end to vasnmprintf() that returns the sanitized
 * string in newly allocated memory.
 *
 * *outp is cleared first and then filled in by vasnmprintf(); sz, wp
 * and the return value have the meaning they have there.  The caller
 * owns whatever ends up in *outp.
 */
int
asmprintf(char **outp, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 args;
	int	 n;

	va_start(args, fmt);
	*outp = NULL;
	n = vasnmprintf(outp, sz, wp, fmt, args);
	va_end(args);

	return n;
}
/*
 * vfmprintf() and its variadic wrappers fmprintf() and mprintf() write
 * sanitized output to a stdio stream.  To stay close to the standard
 * interfaces, they return the number of non-NUL bytes written.
 *
 * The text is produced by vasnmprintf() with a size limit of INT_MAX
 * and no column limit.  -1 is returned if that step fails or if
 * fputs() reports an error.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*text = NULL;
	int	 n;

	n = vasnmprintf(&text, INT_MAX, NULL, fmt, ap);
	/* Only attempt the write when sanitizing succeeded. */
	if (n >= 0 && fputs(text, stream) == EOF)
		n = -1;
	free(text);

	return n < 0 ? -1 : n;
}
/*
 * Variadic form of vfmprintf(): write sanitized, formatted output to
 * stream and pass vfmprintf()'s return value through unchanged.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 n;

	va_start(args, fmt);
	n = vfmprintf(stream, fmt, args);
	va_end(args);

	return n;
}
/*
 * Write sanitized, formatted output to stdout by way of vfmprintf()
 * and pass its return value through unchanged.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 n;

	va_start(args, fmt);
	n = vfmprintf(stdout, fmt, args);
	va_end(args);

	return n;
}
/*
 * Set up libc for multibyte output in the user's chosen locale.
 *
 * The first of LC_ALL, LC_CTYPE and LANG that is present in the
 * environment decides.  If its value does not start with "TR"
 * (compared case-insensitively), LC_CTYPE is taken from the
 * environment as usual; the same happens when none of the three is
 * set.
 *
 * XXX: we are known to have problems with Turkish (i/I confusion), so
 * Turkish locales deliberately fall back to the C locale for now.
 * Longer term we should always prefer to select C.[encoding] if
 * possible, but there's no standardisation in locales between systems,
 * so we'll need to survey what's out there first.
 */
void
msetlocale(void)
{
	static const char *const env_names[] = { "LC_ALL", "LC_CTYPE", "LANG" };
	const size_t	 n_names = sizeof(env_names) / sizeof(env_names[0]);
	const char	*value;
	size_t		 k;

	for (k = 0; k < n_names; k++) {
		value = getenv(env_names[k]);
		if (value == NULL)
			continue;

		/* Anything but a Turkish locale can be used as it is. */
		if (strncasecmp(value, "TR", 2) != 0)
			break;

		/*
		 * We can't yet cope with dotless/dotted I.  For a UTF-8
		 * locale, prefer C.UTF-8 (or its equivalent) if the
		 * system provides it.
		 */
		if (strcasestr(value, "UTF-8") != NULL ||
		    strcasestr(value, "UTF8") != NULL) {
			if (setlocale(LC_CTYPE, "C.UTF-8") != NULL)
				return;
			if (setlocale(LC_CTYPE, "POSIX.UTF-8") != NULL)
				return;
		}

		/* Otherwise fall back to the plain C locale. */
		setlocale(LC_CTYPE, "C");
		return;
	}

	/* We can handle this locale. */
	setlocale(LC_CTYPE, "");
}
|