fgrep.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. /*
  2. #define DEBUG /* compile code to display tree structure */
  3. /*
  4. * fgrep -- print all lines containing any of a set of keywords
  5. *
  6. * status returns:
  7. * 0 - ok, and some matches
  8. * 1 - ok, but no matches
  9. * 2 - some error
  10. */
  11. static char sccsid[] = "@(#)fgrep.c 2.0";
  12. /*
  13. * fgrep employs a generalisation of the Knuth-Morris-Pratt string matching
  14. * algorithm (ref: Baase, S: Computer Algorithms, 1978), which builds a
  15. * "flowchart" tree representing the patterns to be searched for. The tree
  16. * is constructed so that possible character matches in a particular position
  17. * in the composite pattern are linked together (using the "link" field of
  18. * struct words), and share a common parent tree. For example, the patterns
  19. * "roar" and "road" are represented by three nodes containing "r", "o" and
  20. * "a", linked by success pointers ("succ"), and two nodes "r" and "d"
  21. * joined by "link" pointers. The succ pointer of "a" points to the head
  22. * of this list. All unused pointers are NULL.
  23. *
  24. * The feature of this algorithm is that it is never necessary to back up
  25. * in the subject when matching fails. If a mismatch occurs, a failure link
  26. * is followed to an earlier point in the pattern which is known still to be
  27. * matched. For example, if the fourth character of "poppy" fails, we
  28. * should continue from the second character, as the first remains matched.
  29. * Thus "popoppy" will be matched with 8 comparisons.
  30. *
  31. * For exact matching (-x), a newline character is appended to each pattern.
  32. * No failure links are computed, as any mismatch causes total failure.
  33. *
  34. * Exception conditions - pattern building:
  35. * 1. If -x is not specified, the null pattern is represented by a NULL tree
  36. * and always matches; otherwise it is stored as a single newline.
  37. * 2. If a pattern is an initial substring of another, the longer string
  38. * is ignored (since the shorter matches both).
  39. * 3. If a pattern is a non-initial substring of another, the longer string
  40. * is truncated at the end of the common substring. Thus "pattern" and
  41. * "at" become "pat" and "at". The longer string should really be deleted,
  42. * but this is a non-trivial task as it may share a common prefix with
  43. * some other pattern (such as "pad"). The redundant string is harmless.
  44. * 4. Patterns longer than BUFSIZ are truncated.
  45. *
  46. * Exception conditions - matching:
  47. * 1. The length of lines able to be matched is unlimited, but only the first
  48. * BUFSIZ characters are retained for printing.
  49. * 2. If the last non-empty line in a file does not contain a newline,
  50. * one is added if the line is printed. It is not possible to exactly
  51. * match such a line with the -x option.
  52. *
  53. * Author: Geoff Whale
  54. * University of New South Wales, Sydney
  55. * May, 1984.
  56. */
  57. #include <stdio.h>
  58. #include <ctype.h>
  59. #define INCRSIZ 64
  60. #define QSIZE 128
  61. #if u370
  62. #define BLKSIZE 4096
  63. #else
  64. #define BLKSIZE BUFSIZ
  65. #endif
  66. struct words {
  67. char key;
  68. struct words *succ;
  69. struct words *link;
  70. struct words *fail;
  71. } w[INCRSIZ], /* first block, others obtained through walloc() */
  72. *tree = NULL, /* root of the KMP flowchart tree */
  73. *smax = w, /* last element used */
  74. *slim = &w[INCRSIZ]; /* limit for smax, then malloc some more */
  75. struct qpage {
  76. struct words *queue[QSIZE];
  77. struct qpage *nextpage;
  78. };
  79. int bflag, cflag, lflag, fflag, nflag, sflag, vflag, xflag, yflag, eflag;
  80. #ifdef DEBUG
  81. char *usage = "[ -bcdlnsvxy ] [ -f stringfile ] [ -e ] strings [ files... ]";
  82. char *optlist = "bcde:f:lmnsvxy";
  83. int dflag;
  84. #else
  85. char *usage = "[ -bclnsvxy ] [ -f stringfile ] [ -e ] strings [ files... ]";
  86. char *optlist = "bce:f:lmnsvxy";
  87. #endif DEBUG
  88. int numfiles; /* prepend filename to matches if >1 */
  89. int anymatches, notfound; /* for exit status */
  90. FILE *stringfile = NULL;
  91. char *argptr;
  92. extern char *optarg;
  93. extern int optind;
  94. extern char *malloc(), *calloc();
  95. #define match(ec,lc) (ec == lc || (yflag && isupper(lc) && ec == lc + 'a' - 'A'))
  96. main(argc, argv)
  97. char **argv;
  98. {
  99. register int c;
  100. int errflg = 0;
  101. while(( c = getopt(argc, argv, optlist)) != EOF)
  102. switch(c) {
  103. case 'b': /* block number prepended */
  104. bflag++;
  105. continue;
  106. case 'c': /* count of lines only */
  107. cflag++;
  108. continue;
  109. #ifdef DEBUG
  110. case 'd': /* debug/display option */
  111. dflag++;
  112. continue;
  113. #endif DEBUG
  114. case 'e': /* expression follows */
  115. eflag++;
  116. argptr = optarg;
  117. continue;
  118. case 'f': /* string file follows */
  119. fflag++;
  120. stringfile = fopen(optarg, "r");
  121. if (stringfile == NULL) {
  122. fprintf(stderr, "fgrep: can't open %s\n", optarg);
  123. exit(2);
  124. }
  125. continue;
  126. case 'l': /* list file names only */
  127. lflag++;
  128. continue;
  129. case 'n': /* line number prepended */
  130. nflag++;
  131. continue;
  132. case 's': /* return status only */
  133. sflag++;
  134. continue;
  135. case 'v': /* print lines not matching */
  136. vflag++;
  137. continue;
  138. case 'x': /* exact match only */
  139. xflag++;
  140. continue;
  141. case 'y': /* lower case in expression matches upper case too */
  142. yflag++;
  143. continue;
  144. case 'm': /* grep option selecting fgrep */
  145. continue;
  146. case '?':
  147. errflg++;
  148. }
  149. argc -= optind;
  150. if (errflg || ((argc <= 0) && !fflag && !eflag)) {
  151. fprintf(stderr, "usage: fgrep %s\n",usage);
  152. exit(2);
  153. }
  154. if ( !eflag && !fflag ) {
  155. argptr = argv[optind];
  156. optind++;
  157. argc--;
  158. }
  159. cgotofn();
  160. if (stringfile != NULL)
  161. fclose(stringfile);
  162. if (!xflag)
  163. cfail();
  164. #ifdef DEBUG
  165. if (dflag)
  166. display(tree);
  167. #endif
  168. numfiles = argc;
  169. argv = &argv[optind];
  170. if (argc <= 0) {
  171. if (lflag)
  172. exit(2); /* can't list name of standard input */
  173. execute((char *) NULL);
  174. }
  175. else
  176. while ( --argc >= 0 ) {
  177. execute(*argv);
  178. argv++;
  179. }
  180. exit (notfound? 2 : !anymatches? 1 : 0);
  181. }
  182. char *
  183. getstring(buf, len)
  184. char *buf;
  185. int len;
  186. /* Returns next string in buf, NULL if input is exhausted.
  187. * If xflag, \n precedes the null terminator.
  188. * Long strings (> len-2 chars) are silently truncated.
  189. */
  190. {
  191. register char *sp;
  192. register int c;
  193. char *lim;
  194. lim = &buf[len-2];
  195. sp = buf;
  196. do {
  197. if (stringfile != NULL) {
  198. if ((c = getc(stringfile)) == EOF)
  199. break;
  200. }
  201. else if ((c = *argptr++) == '\0') {
  202. argptr--; /* so next call on getstring sees the null */
  203. break;
  204. }
  205. if (sp < lim)
  206. *sp++ = c;
  207. } while (c != '\n');
  208. if (sp == buf && (!xflag || tree != NULL))
  209. /* true EOF, and empty pattern list processed if xflag */
  210. return(NULL);
  211. if (c == '\n' && xflag == 0)
  212. sp--;
  213. else if (c != '\n' && xflag == 1)
  214. *sp++ = '\n';
  215. *sp = '\0';
  216. return(buf);
  217. }
  218. cgotofn()
  219. {
  220. register struct words *t;
  221. register char *sp;
  222. register int c;
  223. char buf[BUFSIZ+2]; /* enough for newline and null + BUFSIZ chars */
  224. struct words *last, *walloc();
  225. tree = NULL;
  226. while ((sp = getstring(buf, sizeof(buf))) != NULL) {
  227. /* match prefix of buf against previous patterns */
  228. last = NULL;
  229. for (t = tree; (c = *sp++) != '\0' && t != NULL; last = t, t = t->succ) {
  230. while (t->key != c && t->link != NULL)
  231. t = t->link;
  232. if (t->key != c)
  233. break;
  234. }
  235. if (c != '\0' && (t != NULL || tree == NULL)) {
  236. /* first pattern (non-empty) or buf is neither initial substring
  237. * nor initial superstring of previous pattern; add new tail
  238. * to common prefix.
  239. */
  240. if (t == NULL)
  241. t = tree = walloc();
  242. else
  243. t = t->link = walloc();
  244. t->key = c;
  245. while ((c = *sp++) != '\0') {
  246. t = t->succ = walloc();
  247. t->key = c;
  248. }
  249. }
  250. else if (c == '\0')
  251. /* current pattern is initial substring of previous pattern */
  252. if (last != NULL)
  253. last->succ = NULL; /* delete suffix of longer string */
  254. else {
  255. tree = NULL; /* null string: matches all */
  256. return;
  257. }
  258. }
  259. }
  260. overflo()
  261. {
  262. fprintf(stderr, "fgrep: word list too large\n");
  263. exit(2);
  264. }
  265. struct words *
  266. walloc()
  267. {
  268. if (smax >= slim) {
  269. if ((smax = (struct words *) calloc(INCRSIZ, sizeof(struct words))) == NULL)
  270. overflo();
  271. slim = &smax[INCRSIZ];
  272. }
  273. return(smax++);
  274. }
  275. struct qpage *
  276. qalloc()
  277. {
  278. register struct qpage *qp;
  279. if ((qp = (struct qpage *) malloc(sizeof(struct qpage))) == NULL)
  280. overflo();
  281. qp->nextpage = NULL;
  282. return(qp);
  283. }
  284. cfail()
  285. /* Performs a breadth-first traversal of the tree, setting failure
  286. * links at each node above the first level. */
  287. {
  288. struct words **front, **rear;
  289. struct qpage *firstpage, *lastpage;
  290. register struct words *t, *pfail, *q;
  291. register char c;
  292. if (tree == NULL)
  293. return;
  294. firstpage = lastpage = qalloc();
  295. front = rear = firstpage->queue;
  296. *rear++ = tree;
  297. while (rear != front) {
  298. t = *front++;
  299. if (front >= &(firstpage->queue[QSIZE])) {
  300. free( (char *) firstpage);
  301. firstpage = firstpage->nextpage;
  302. front = firstpage->queue;
  303. }
  304. /* now set failure links for the children of t and its siblings,
  305. * scheduling their childrens' link setting at the same time.
  306. */
  307. do {
  308. if ((q = t->succ) != NULL) {
  309. pfail = t->fail;
  310. c = t->key;
  311. while (pfail != NULL && pfail->key != c)
  312. if (pfail->link != NULL)
  313. pfail = pfail->link;
  314. else
  315. pfail = pfail->fail;
  316. if (pfail == NULL)
  317. pfail = tree;
  318. else
  319. pfail = pfail->succ;
  320. if (pfail == NULL)
  321. /* current pattern (path from root) ends in a suffix
  322. * which is another complete pattern; delete any
  323. * remaining subtree.
  324. */
  325. t->succ = NULL;
  326. else {
  327. *rear++ = q;
  328. if (rear >= &(lastpage->queue[QSIZE])) {
  329. lastpage->nextpage = qalloc();
  330. lastpage = lastpage->nextpage;
  331. rear = lastpage->queue;
  332. }
  333. do
  334. q->fail = pfail;
  335. while ((q = q->link) != NULL);
  336. }
  337. }
  338. } while ((t = t->link) != NULL);
  339. }
  340. free( (char *) lastpage);
  341. }
  342. #define BUF2 (&buf[BUFSIZ])
  343. #define BUF3 (&buf[2*BUFSIZ])
  344. #define ENDBUF3 (&buf[3*BUFSIZ])
  345. execute(filename)
  346. char *filename;
  347. {
  348. register char *p;
  349. register struct words *t;
  350. register int c;
  351. long lcount, totalccount, nmatches;
  352. int ccount, pcount, f;
  353. int found, finished, matched;
  354. char buf[3*BUFSIZ]; /* large buffer + auxiliary buffer */
  355. char *start, *lim, *oldp;
  356. if (filename != NULL) {
  357. if ((f = open(filename, 0)) < 0) {
  358. if(!sflag)
  359. fprintf(stderr, "fgrep: can't open %s\n", filename);
  360. else
  361. exit(2);
  362. notfound = 1;
  363. return;
  364. }
  365. }
  366. else
  367. f = fileno(stdin);
  368. lcount = 0L; totalccount = 0L; nmatches = 0L;
  369. ccount = 0; p = buf;
  370. do { /* process one line */
  371. t = tree; lcount++;
  372. found = finished = (t == NULL);
  373. if ((start = p) == BUF3)
  374. start = buf;
  375. do { /* process one character */
  376. if (ccount-- == 0) { /* buffer empty */
  377. if (p == BUF3)
  378. p = buf; /* wrap-around from second half of main buffer */
  379. if (p > BUF2) {
  380. if (start > p && start < BUF3)
  381. copyblock(&start, BUF3);
  382. ccount = read(f, p, (unsigned) (BUF3 - p));
  383. }
  384. else { /* in first half or at start of second half
  385. * of main buffer */
  386. if (start > p && start < &p[BUFSIZ])
  387. copyblock(&start, BUF3);
  388. ccount = read(f, p, BUFSIZ);
  389. }
  390. if (ccount <= 0) {
  391. c = EOF;
  392. break;
  393. }
  394. totalccount += ccount--;
  395. }
  396. c = *p++;
  397. if (!finished) {
  398. while (t != NULL) {
  399. while ( !(matched = match(t->key,c)) && t->link != NULL)
  400. /* try alternatives at this level */
  401. t = t->link;
  402. if (matched)
  403. break;
  404. t = t->fail; /* chain back in pattern */
  405. finished = xflag;
  406. }
  407. if (t == NULL) /* all alternatives tried, restart from
  408. * first pattern character */
  409. t = tree;
  410. else if ((t = t->succ) == NULL)
  411. found = finished = 1;
  412. }
  413. } while (c != '\n');
  414. if (vflag)
  415. found = !found;
  416. if (found && p != start) {
  417. anymatches++;
  418. if (sflag)
  419. exit(0); /* matched: no need to look further */
  420. if (cflag)
  421. nmatches++;
  422. else if (lflag) {
  423. printf("%s\n", filename);
  424. close(f);
  425. return;
  426. }
  427. else {
  428. if (numfiles > 1)
  429. printf("%s:", filename);
  430. if (bflag)
  431. printf("%ld:", (totalccount - ccount) / BLKSIZE);
  432. if (nflag)
  433. printf("%ld:", lcount);
  434. oldp = p; pcount = 0;
  435. if (start < p) /* no wrap-around or long line */
  436. lim = p;
  437. else if (start == BUF3) /* saved long line */
  438. lim = ENDBUF3;
  439. else { /* print tail of BUF2 first */
  440. for (p = start; p < BUF3; p++)
  441. putchar(*p);
  442. pcount = p - start;
  443. lim = oldp; start = buf;
  444. }
  445. if (pcount + (lim-start) > BUFSIZ)
  446. /* limit all lines to BUFSIZ chars to be consistent */
  447. lim = &start[BUFSIZ - pcount];
  448. for (p = start; p < lim; p++)
  449. putchar(*p);
  450. if (p[-1] != '\n') /* line truncated or non-terminated last line */
  451. putchar('\n');
  452. p = oldp;
  453. }
  454. }
  455. } while (c != EOF);
  456. close(f);
  457. if (cflag) {
  458. if (numfiles > 1)
  459. printf("%s:", filename);
  460. printf("%ld\n", nmatches);
  461. }
  462. }
  463. copyblock (startp, dest)
  464. char **startp, *dest;
  465. /* copy BUFSIZ chars from *startp to dest; reset *startp to dest */
  466. {
  467. register char *sp, *dp;
  468. char *lim;
  469. lim = *startp + BUFSIZ; dp = dest;
  470. for (sp = *startp; sp < lim; sp++)
  471. *dp++ = *sp;
  472. *startp = dest;
  473. }
  474. #ifdef DEBUG
  475. struct qpage *firstpage, *lastpage;
  476. struct words **nextfree;
  477. display(root)
  478. struct words *root;
  479. {
  480. if (root != NULL) {
  481. firstpage = lastpage = qalloc();
  482. nextfree = lastpage->queue;
  483. printkeys(root, 0);
  484. if (!xflag)
  485. printfaillinks();
  486. do
  487. free( (char *) firstpage);
  488. while ((firstpage = firstpage->nextpage) != NULL);
  489. }
  490. }
  491. printkeys(t, level)
  492. register struct words *t;
  493. int level;
  494. {
  495. register int c, i;
  496. if ((c = t->key) == '\n')
  497. printf("\\n");
  498. else if (c < ' ' || c > '~')
  499. putchar('?');
  500. else {
  501. putchar(c);
  502. *nextfree++ = t;
  503. if (nextfree >= &(lastpage->queue[QSIZE])) {
  504. lastpage->nextpage = qalloc();
  505. lastpage = lastpage->nextpage;
  506. nextfree = lastpage->queue;
  507. }
  508. if (t->succ == NULL)
  509. putchar('\n');
  510. else
  511. printkeys(t->succ, level+1);
  512. if (t->link != NULL) {
  513. for (i=0; i < level; i++)
  514. printf("^\b|");
  515. printkeys(t->link, level);
  516. }
  517. }
  518. }
  519. tlookup(p, keyp, posp)
  520. struct words *p;
  521. int *keyp, *posp;
  522. {
  523. register struct words **wp;
  524. register struct qpage *curpage;
  525. int count;
  526. curpage = firstpage; wp = curpage->queue; count = 0;
  527. while (wp != nextfree && *wp != p)
  528. if (++wp >= &(curpage->queue[QSIZE])) {
  529. count += QSIZE;
  530. curpage = curpage->nextpage;
  531. wp = curpage->queue;
  532. }
  533. if (wp == nextfree)
  534. return(0); /* not found */
  535. else {
  536. *posp = count + 1 + (wp - curpage->queue);
  537. *keyp = (*wp)->key;
  538. return(1);
  539. }
  540. }
  541. printfaillinks()
  542. {
  543. register struct words **wp;
  544. register struct qpage *curpage;
  545. register int count;
  546. int key, pos;
  547. printf("num key fail link\n");
  548. curpage = firstpage; wp = curpage->queue; count = 0;
  549. while (wp != nextfree) {
  550. printf("%3d%3c ", ++count, (*wp)->key);
  551. if ((*wp)->fail == NULL)
  552. printf("NULL\n");
  553. else if (tlookup((*wp)->fail, &key, &pos))
  554. printf("%d (%c)\n");
  555. else
  556. printf("unknown\n");
  557. if (++wp >= &(curpage->queue[QSIZE])) {
  558. count += QSIZE;
  559. curpage = curpage->nextpage;
  560. wp = curpage->queue;
  561. }
  562. }
  563. }
  564. #endif DEBUG