str.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. * 3. Neither the name of the University nor the names of its contributors
  16. * may be used to endorse or promote products derived from this software
  17. * without specific prior written permission.
  18. *
  19. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29. * SUCH DAMAGE.
  30. */
#include <sys/types.h>

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>

#include "extern.h"
  42. static int backslash(STR *, int *);
  43. static int bracket(STR *);
  44. static void genclass(STR *);
  45. static void genequiv(STR *);
  46. static int genrange(STR *, int);
  47. static void genseq(STR *);
  48. wint_t
  49. next(STR *s)
  50. {
  51. int is_octal;
  52. wint_t ch;
  53. wchar_t wch;
  54. size_t clen;
  55. switch (s->state) {
  56. case EOS:
  57. return (0);
  58. case INFINITE:
  59. return (1);
  60. case NORMAL:
  61. switch (*s->str) {
  62. case '\0':
  63. s->state = EOS;
  64. return (0);
  65. case '\\':
  66. s->lastch = backslash(s, &is_octal);
  67. break;
  68. case '[':
  69. if (bracket(s))
  70. return (next(s));
  71. /* FALLTHROUGH */
  72. default:
  73. clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
  74. if (clen == (size_t)-1 || clen == (size_t)-2 ||
  75. clen == 0)
  76. errc(1, EILSEQ, NULL);
  77. is_octal = 0;
  78. s->lastch = wch;
  79. s->str += clen;
  80. break;
  81. }
  82. /* We can start a range at any time. */
  83. if (s->str[0] == '-' && genrange(s, is_octal))
  84. return (next(s));
  85. return (1);
  86. case RANGE:
  87. if (s->cnt-- == 0) {
  88. s->state = NORMAL;
  89. return (next(s));
  90. }
  91. ++s->lastch;
  92. return (1);
  93. case SEQUENCE:
  94. if (s->cnt-- == 0) {
  95. s->state = NORMAL;
  96. return (next(s));
  97. }
  98. return (1);
  99. case CCLASS:
  100. case CCLASS_UPPER:
  101. case CCLASS_LOWER:
  102. s->cnt++;
  103. ch = nextwctype(s->lastch, s->cclass);
  104. if (ch == -1) {
  105. s->state = NORMAL;
  106. return (next(s));
  107. }
  108. s->lastch = ch;
  109. return (1);
  110. case SET:
  111. if ((ch = s->set[s->cnt++]) == OOBCH) {
  112. s->state = NORMAL;
  113. return (next(s));
  114. }
  115. s->lastch = ch;
  116. return (1);
  117. default:
  118. return (0);
  119. }
  120. /* NOTREACHED */
  121. }
  122. static int
  123. bracket(STR *s)
  124. {
  125. char *p;
  126. switch (s->str[1]) {
  127. case ':': /* "[:class:]" */
  128. if ((p = strchr(s->str + 2, ']')) == NULL)
  129. return (0);
  130. if (*(p - 1) != ':' || p - s->str < 4)
  131. goto repeat;
  132. *(p - 1) = '\0';
  133. s->str += 2;
  134. genclass(s);
  135. s->str = p + 1;
  136. return (1);
  137. case '=': /* "[=equiv=]" */
  138. if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
  139. return (0);
  140. if (*(p - 1) != '=' || p - s->str < 4)
  141. goto repeat;
  142. s->str += 2;
  143. genequiv(s);
  144. return (1);
  145. default: /* "[\###*n]" or "[#*n]" */
  146. repeat:
  147. if ((p = strpbrk(s->str + 2, "*]")) == NULL)
  148. return (0);
  149. if (p[0] != '*' || strchr(p, ']') == NULL)
  150. return (0);
  151. s->str += 1;
  152. genseq(s);
  153. return (1);
  154. }
  155. /* NOTREACHED */
  156. }
  157. static void
  158. genclass(STR *s)
  159. {
  160. if ((s->cclass = wctype(s->str)) == 0)
  161. errx(1, "unknown class %s", s->str);
  162. s->cnt = 0;
  163. s->lastch = -1; /* incremented before check in next() */
  164. if (strcmp(s->str, "upper") == 0)
  165. s->state = CCLASS_UPPER;
  166. else if (strcmp(s->str, "lower") == 0)
  167. s->state = CCLASS_LOWER;
  168. else
  169. s->state = CCLASS;
  170. }
  171. static void
  172. genequiv(STR *s)
  173. {
  174. int i, p, pri;
  175. char src[2], dst[3];
  176. size_t clen;
  177. wchar_t wc;
  178. if (*s->str == '\\') {
  179. s->equiv[0] = backslash(s, NULL);
  180. if (*s->str != '=')
  181. errx(1, "misplaced equivalence equals sign");
  182. s->str += 2;
  183. } else {
  184. clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
  185. if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
  186. errc(1, EILSEQ, NULL);
  187. s->equiv[0] = wc;
  188. if (s->str[clen] != '=')
  189. errx(1, "misplaced equivalence equals sign");
  190. s->str += clen + 2;
  191. }
  192. /*
  193. * Calculate the set of all characters in the same equivalence class
  194. * as the specified character (they will have the same primary
  195. * collation weights).
  196. * XXX Knows too much about how strxfrm() is implemented. Assumes
  197. * it fills the string with primary collation weight bytes. Only one-
  198. * to-one mappings are supported.
  199. * XXX Equivalence classes not supported in multibyte locales.
  200. */
  201. src[0] = (char)s->equiv[0];
  202. src[1] = '\0';
  203. if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
  204. pri = (unsigned char)*dst;
  205. for (p = 1, i = 1; i < NCHARS_SB; i++) {
  206. *src = i;
  207. if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
  208. pri == (unsigned char)*dst)
  209. s->equiv[p++] = i;
  210. }
  211. s->equiv[p] = OOBCH;
  212. }
  213. s->cnt = 0;
  214. s->state = SET;
  215. s->set = s->equiv;
  216. }
  217. static int
  218. genrange(STR *s, int was_octal)
  219. {
  220. int stopval, octal;
  221. char *savestart;
  222. int n, cnt, *p;
  223. size_t clen;
  224. wchar_t wc;
  225. octal = 0;
  226. savestart = s->str;
  227. if (*++s->str == '\\')
  228. stopval = backslash(s, &octal);
  229. else {
  230. clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
  231. if (clen == (size_t)-1 || clen == (size_t)-2)
  232. errc(1, EILSEQ, NULL);
  233. stopval = wc;
  234. s->str += clen;
  235. }
  236. /*
  237. * XXX Characters are not ordered according to collating sequence in
  238. * multibyte locales.
  239. */
  240. if (octal || was_octal || MB_CUR_MAX > 1) {
  241. if (stopval < s->lastch) {
  242. s->str = savestart;
  243. return (0);
  244. }
  245. s->cnt = stopval - s->lastch + 1;
  246. s->state = RANGE;
  247. --s->lastch;
  248. return (1);
  249. }
  250. if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
  251. s->str = savestart;
  252. return (0);
  253. }
  254. if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
  255. err(1, "genrange() malloc");
  256. for (cnt = 0; cnt < NCHARS_SB; cnt++)
  257. if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
  258. charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
  259. *p++ = cnt;
  260. *p = OOBCH;
  261. n = p - s->set;
  262. s->cnt = 0;
  263. s->state = SET;
  264. if (n > 1)
  265. mergesort(s->set, n, sizeof(*(s->set)), charcoll);
  266. return (1);
  267. }
  268. static void
  269. genseq(STR *s)
  270. {
  271. char *ep;
  272. wchar_t wc;
  273. size_t clen;
  274. if (s->which == STRING1)
  275. errx(1, "sequences only valid in string2");
  276. if (*s->str == '\\')
  277. s->lastch = backslash(s, NULL);
  278. else {
  279. clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
  280. if (clen == (size_t)-1 || clen == (size_t)-2)
  281. errc(1, EILSEQ, NULL);
  282. s->lastch = wc;
  283. s->str += clen;
  284. }
  285. if (*s->str != '*')
  286. errx(1, "misplaced sequence asterisk");
  287. switch (*++s->str) {
  288. case '\\':
  289. s->cnt = backslash(s, NULL);
  290. break;
  291. case ']':
  292. s->cnt = 0;
  293. ++s->str;
  294. break;
  295. default:
  296. if (isdigit((u_char)*s->str)) {
  297. s->cnt = strtol(s->str, &ep, 0);
  298. if (*ep == ']') {
  299. s->str = ep + 1;
  300. break;
  301. }
  302. }
  303. errx(1, "illegal sequence count");
  304. /* NOTREACHED */
  305. }
  306. s->state = s->cnt ? SEQUENCE : INFINITE;
  307. }
  308. /*
  309. * Translate \??? into a character. Up to 3 octal digits, if no digits either
  310. * an escape code or a literal character.
  311. */
  312. static int
  313. backslash(STR *s, int *is_octal)
  314. {
  315. int ch, cnt, val;
  316. if (is_octal != NULL)
  317. *is_octal = 0;
  318. for (cnt = val = 0;;) {
  319. ch = (u_char)*++s->str;
  320. if (!isdigit(ch) || ch > '7')
  321. break;
  322. val = val * 8 + ch - '0';
  323. if (++cnt == 3) {
  324. ++s->str;
  325. break;
  326. }
  327. }
  328. if (cnt) {
  329. if (is_octal != NULL)
  330. *is_octal = 1;
  331. return (val);
  332. }
  333. if (ch != '\0')
  334. ++s->str;
  335. switch (ch) {
  336. case 'a': /* escape characters */
  337. return ('\7');
  338. case 'b':
  339. return ('\b');
  340. case 'f':
  341. return ('\f');
  342. case 'n':
  343. return ('\n');
  344. case 'r':
  345. return ('\r');
  346. case 't':
  347. return ('\t');
  348. case 'v':
  349. return ('\13');
  350. case '\0': /* \" -> \ */
  351. s->state = EOS;
  352. return ('\\');
  353. default: /* \x" -> x */
  354. return (ch);
  355. }
  356. }