tokenizer.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. /* $NetBSD: tokenizer.c,v 1.10 2002/03/18 16:01:00 christos Exp $ */
  2. /*-
  3. * Copyright (c) 1992, 1993
  4. * The Regents of the University of California. All rights reserved.
  5. *
  6. * This code is derived from software contributed to Berkeley by
  7. * Christos Zoulas of Cornell University.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in the
  16. * documentation and/or other materials provided with the distribution.
  17. * 3. All advertising materials mentioning features or use of this software
  18. * must display the following acknowledgement:
  19. * This product includes software developed by the University of
  20. * California, Berkeley and its contributors.
  21. * 4. Neither the name of the University nor the names of its contributors
  22. * may be used to endorse or promote products derived from this software
  23. * without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35. * SUCH DAMAGE.
  36. */
  37. #include "config.h"
  38. #if !defined(lint) && !defined(SCCSID)
  39. #if 0
  40. static char sccsid[] = "@(#)tokenizer.c 8.1 (Berkeley) 6/4/93";
  41. #else
  42. __RCSID("$NetBSD: tokenizer.c,v 1.10 2002/03/18 16:01:00 christos Exp $");
  43. #endif
  44. #endif /* not lint && not SCCSID */
/*
 * tokenizer.c: Bourne shell-like tokenizer
 */
  48. #include <string.h>
  49. #include <stdlib.h>
  50. #include "tokenizer.h"
  51. typedef enum {
  52. Q_none, Q_single, Q_double, Q_one, Q_doubleone
  53. } quote_t;
  54. #define IFS "\t \n"
  55. #define TOK_KEEP 1
  56. #define TOK_EAT 2
  57. #define WINCR 20
  58. #define AINCR 10
  59. #define tok_malloc(a) malloc(a)
  60. #define tok_free(a) free(a)
  61. #define tok_realloc(a, b) realloc(a, b)
  62. struct tokenizer {
  63. char *ifs; /* In field separator */
  64. int argc, amax; /* Current and maximum number of args */
  65. char **argv; /* Argument list */
  66. char *wptr, *wmax; /* Space and limit on the word buffer */
  67. char *wstart; /* Beginning of next word */
  68. char *wspace; /* Space of word buffer */
  69. quote_t quote; /* Quoting state */
  70. int flags; /* flags; */
  71. };
  72. private void tok_finish(Tokenizer *);
  73. /* tok_finish():
  74. * Finish a word in the tokenizer.
  75. */
  76. private void
  77. tok_finish(Tokenizer *tok)
  78. {
  79. *tok->wptr = '\0';
  80. if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
  81. tok->argv[tok->argc++] = tok->wstart;
  82. tok->argv[tok->argc] = NULL;
  83. tok->wstart = ++tok->wptr;
  84. }
  85. tok->flags &= ~TOK_KEEP;
  86. }
  87. /* tok_init():
  88. * Initialize the tokenizer
  89. */
  90. public Tokenizer *
  91. tok_init(const char *ifs)
  92. {
  93. Tokenizer *tok = (Tokenizer *) tok_malloc(sizeof(Tokenizer));
  94. tok->ifs = strdup(ifs ? ifs : IFS);
  95. tok->argc = 0;
  96. tok->amax = AINCR;
  97. tok->argv = (char **) tok_malloc(sizeof(char *) * tok->amax);
  98. if (tok->argv == NULL)
  99. return (NULL);
  100. tok->argv[0] = NULL;
  101. tok->wspace = (char *) tok_malloc(WINCR);
  102. if (tok->wspace == NULL)
  103. return (NULL);
  104. tok->wmax = tok->wspace + WINCR;
  105. tok->wstart = tok->wspace;
  106. tok->wptr = tok->wspace;
  107. tok->flags = 0;
  108. tok->quote = Q_none;
  109. return (tok);
  110. }
  111. /* tok_reset():
  112. * Reset the tokenizer
  113. */
  114. public void
  115. tok_reset(Tokenizer *tok)
  116. {
  117. tok->argc = 0;
  118. tok->wstart = tok->wspace;
  119. tok->wptr = tok->wspace;
  120. tok->flags = 0;
  121. tok->quote = Q_none;
  122. }
  123. /* tok_end():
  124. * Clean up
  125. */
  126. public void
  127. tok_end(Tokenizer *tok)
  128. {
  129. tok_free((ptr_t) tok->ifs);
  130. tok_free((ptr_t) tok->wspace);
  131. tok_free((ptr_t) tok->argv);
  132. tok_free((ptr_t) tok);
  133. }
  134. /* tok_line():
  135. * Bourne shell like tokenizing
  136. * Return:
  137. * -1: Internal error
  138. * 3: Quoted return
  139. * 2: Unmatched double quote
  140. * 1: Unmatched single quote
  141. * 0: Ok
  142. */
  143. public int
  144. tok_line(Tokenizer *tok, const char *line, int *argc, const char ***argv)
  145. {
  146. const char *ptr;
  147. for (;;) {
  148. switch (*(ptr = line++)) {
  149. case '\'':
  150. tok->flags |= TOK_KEEP;
  151. tok->flags &= ~TOK_EAT;
  152. switch (tok->quote) {
  153. case Q_none:
  154. tok->quote = Q_single; /* Enter single quote
  155. * mode */
  156. break;
  157. case Q_single: /* Exit single quote mode */
  158. tok->quote = Q_none;
  159. break;
  160. case Q_one: /* Quote this ' */
  161. tok->quote = Q_none;
  162. *tok->wptr++ = *ptr;
  163. break;
  164. case Q_double: /* Stay in double quote mode */
  165. *tok->wptr++ = *ptr;
  166. break;
  167. case Q_doubleone: /* Quote this ' */
  168. tok->quote = Q_double;
  169. *tok->wptr++ = *ptr;
  170. break;
  171. default:
  172. return (-1);
  173. }
  174. break;
  175. case '"':
  176. tok->flags &= ~TOK_EAT;
  177. tok->flags |= TOK_KEEP;
  178. switch (tok->quote) {
  179. case Q_none: /* Enter double quote mode */
  180. tok->quote = Q_double;
  181. break;
  182. case Q_double: /* Exit double quote mode */
  183. tok->quote = Q_none;
  184. break;
  185. case Q_one: /* Quote this " */
  186. tok->quote = Q_none;
  187. *tok->wptr++ = *ptr;
  188. break;
  189. case Q_single: /* Stay in single quote mode */
  190. *tok->wptr++ = *ptr;
  191. break;
  192. case Q_doubleone: /* Quote this " */
  193. tok->quote = Q_double;
  194. *tok->wptr++ = *ptr;
  195. break;
  196. default:
  197. return (-1);
  198. }
  199. break;
  200. case '\\':
  201. tok->flags |= TOK_KEEP;
  202. tok->flags &= ~TOK_EAT;
  203. switch (tok->quote) {
  204. case Q_none: /* Quote next character */
  205. tok->quote = Q_one;
  206. break;
  207. case Q_double: /* Quote next character */
  208. tok->quote = Q_doubleone;
  209. break;
  210. case Q_one: /* Quote this, restore state */
  211. *tok->wptr++ = *ptr;
  212. tok->quote = Q_none;
  213. break;
  214. case Q_single: /* Stay in single quote mode */
  215. *tok->wptr++ = *ptr;
  216. break;
  217. case Q_doubleone: /* Quote this \ */
  218. tok->quote = Q_double;
  219. *tok->wptr++ = *ptr;
  220. break;
  221. default:
  222. return (-1);
  223. }
  224. break;
  225. case '\n':
  226. tok->flags &= ~TOK_EAT;
  227. switch (tok->quote) {
  228. case Q_none:
  229. tok_finish(tok);
  230. *argv = (const char **)tok->argv;
  231. *argc = tok->argc;
  232. return (0);
  233. case Q_single:
  234. case Q_double:
  235. *tok->wptr++ = *ptr; /* Add the return */
  236. break;
  237. case Q_doubleone: /* Back to double, eat the '\n' */
  238. tok->flags |= TOK_EAT;
  239. tok->quote = Q_double;
  240. break;
  241. case Q_one: /* No quote, more eat the '\n' */
  242. tok->flags |= TOK_EAT;
  243. tok->quote = Q_none;
  244. break;
  245. default:
  246. return (0);
  247. }
  248. break;
  249. case '\0':
  250. switch (tok->quote) {
  251. case Q_none:
  252. /* Finish word and return */
  253. if (tok->flags & TOK_EAT) {
  254. tok->flags &= ~TOK_EAT;
  255. return (3);
  256. }
  257. tok_finish(tok);
  258. *argv = (const char **)tok->argv;
  259. *argc = tok->argc;
  260. return (0);
  261. case Q_single:
  262. return (1);
  263. case Q_double:
  264. return (2);
  265. case Q_doubleone:
  266. tok->quote = Q_double;
  267. *tok->wptr++ = *ptr;
  268. break;
  269. case Q_one:
  270. tok->quote = Q_none;
  271. *tok->wptr++ = *ptr;
  272. break;
  273. default:
  274. return (-1);
  275. }
  276. break;
  277. default:
  278. tok->flags &= ~TOK_EAT;
  279. switch (tok->quote) {
  280. case Q_none:
  281. if (strchr(tok->ifs, *ptr) != NULL)
  282. tok_finish(tok);
  283. else
  284. *tok->wptr++ = *ptr;
  285. break;
  286. case Q_single:
  287. case Q_double:
  288. *tok->wptr++ = *ptr;
  289. break;
  290. case Q_doubleone:
  291. *tok->wptr++ = '\\';
  292. tok->quote = Q_double;
  293. *tok->wptr++ = *ptr;
  294. break;
  295. case Q_one:
  296. tok->quote = Q_none;
  297. *tok->wptr++ = *ptr;
  298. break;
  299. default:
  300. return (-1);
  301. }
  302. break;
  303. }
  304. if (tok->wptr >= tok->wmax - 4) {
  305. size_t size = tok->wmax - tok->wspace + WINCR;
  306. char *s = (char *) tok_realloc(tok->wspace, size);
  307. if (s == NULL)
  308. return (-1);
  309. if (s != tok->wspace) {
  310. int i;
  311. for (i = 0; i < tok->argc; i++) {
  312. tok->argv[i] =
  313. (tok->argv[i] - tok->wspace) + s;
  314. }
  315. tok->wptr = (tok->wptr - tok->wspace) + s;
  316. tok->wstart = (tok->wstart - tok->wspace) + s;
  317. tok->wspace = s;
  318. }
  319. tok->wmax = s + size;
  320. }
  321. if (tok->argc >= tok->amax - 4) {
  322. char **p;
  323. tok->amax += AINCR;
  324. p = (char **) tok_realloc(tok->argv,
  325. tok->amax * sizeof(char *));
  326. if (p == NULL)
  327. return (-1);
  328. tok->argv = p;
  329. }
  330. }
  331. }