/* Token-reader for Bison's input parser,
Copyright (C) 1984, 1986, 1989 Free Software Foundation, Inc.
This file is part of Bison, the GNU Compiler Compiler.
Bison is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
Bison is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Bison; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
/*
lex() is the entry point. It is called from reader.c.
It returns one of the token-type codes defined in lex.h.
When an identifier is seen, the code IDENTIFIER is returned
and the name is looked up in the symbol table using symtab.c;
symval is set to a pointer to the entry found. */
#include <stdio.h>
#include <ctype.h>
#include "system.h"
#include "files.h"
#include "symtab.h"
#include "lex.h"
#include "new.h"
/* Line counter defined elsewhere; this file increments it once for each
   newline it consumes from finput. */
extern int lineno;
/* Defined elsewhere; lex() sets it to -1 whenever it reads a character
   literal token. */
extern int translations;
/* Defined at the end of this file; declared here because lex() calls it. */
int parse_percent_token();
/* Error reporters defined elsewhere. This file calls fatals with a format
   string plus one argument, and fatal with a single message string. */
#ifdef THINK_C
extern void fatals(char *fmt, ...);
#else
extern void fatals();
#endif /* THINK_C */
extern void fatal();
/* Buffer for storing the current token. */
char *token_buffer;
/* Allocated size of token_buffer, not including space for terminator. */
static int maxtoken;
/* Symbol table entry of the token most recently returned as IDENTIFIER;
   set by lex() from getsym(). */
bucket *symval;
/* Decimal value of the token most recently returned as NUMBER. */
int numval;
/* Token code saved by unlex(), or -1 when no token is waiting. */
static int unlexed; /* these two describe a token to be reread */
static bucket *unlexed_symval; /* by the next call to lex */
/* Set up the lexer state: mark that no token is waiting to be reread and
   allocate the initial token buffer. */
void
init_lex()
{
  /* -1 means there is no unlexed token for lex() to hand back. */
  unlexed = -1;

  /* Room for 100 characters; the extra byte holds the terminator. */
  maxtoken = 100;
  token_buffer = NEW2 (maxtoken + 1, char);
}
/* Double the capacity of token_buffer and return the position in the
   reallocated buffer that corresponds to P, a pointer into the old one.
   A failed reallocation is reported through fatal(). */
static char *
grow_token_buffer (p)
     char *p;
{
  /* Remember where P sits, since realloc may move the buffer. */
  int used = p - token_buffer;

  maxtoken = 2 * maxtoken;
  /* One extra byte beyond maxtoken for the terminator. */
  token_buffer = (char *) realloc(token_buffer, maxtoken + 1);
  if (!token_buffer)
    fatal("virtual memory exhausted");

  return &token_buffer[used];
}
/* Skip white space and comments read from finput and return the first
   character that belongs to neither (this may be EOF).

   Space, tab, form feed and newline count as white space.  Both C style
   comments and `//' comments that run to the end of the line are skipped.
   lineno is incremented exactly once for every newline consumed.

   A `/' that does not begin a comment is reported through fatals(); end of
   file inside a comment (of either kind) is reported through fatal(). */
int
skip_white_space()
{
  register int c;
  register int inside;

  c = getc(finput);

  for (;;)
    {
      int cplus_comment;

      switch (c)
        {
        case '/':
          c = getc(finput);
          if (c != '*' && c != '/')
            fatals("unexpected `/%c' found",c);
          cplus_comment = (c == '/');

          c = getc(finput);

          inside = 1;
          while (inside)
            {
              if (!cplus_comment && c == '*')
                {
                  /* A run of stars ends the comment only if `/' follows;
                     otherwise the character after the stars is examined
                     by the next iteration. */
                  while (c == '*')
                    c = getc(finput);

                  if (c == '/')
                    {
                      inside = 0;
                      c = getc(finput);
                    }
                }
              else if (c == '\n')
                {
                  lineno++;
                  if (cplus_comment)
                    inside = 0;
                  /* Consume the newline in both comment styles.  Leaving
                     it in C when a `//' comment ends would hand it to the
                     outer switch, which would count the same line twice. */
                  c = getc(finput);
                }
              else if (c == EOF)
                fatal("unterminated comment");
              else
                c = getc(finput);
            }

          break;

        case '\n':
          lineno++;
          /* fall through */

        case ' ':
        case '\t':
        case '\f':
          c = getc(finput);
          break;

        default:
          return (c);
        }
    }
}
/* Push TOKEN back so that the next call to lex() returns it again.
   The current symval is saved alongside it and restored at that time. */
void
unlex(token)
     int token;
{
  unlexed_symval = symval;
  unlexed = token;
}
/* Read the next token from finput and return its token-type code.

   If a token was pushed back with unlex(), that token and its symval are
   returned without reading any input.  Otherwise white space and comments
   are skipped and one token is read:

     end of file           ENDFILE
     name (letters, digits, `_', `.'; not starting with a digit)
                           IDENTIFIER; symval is the getsym() entry
     decimal digits        NUMBER; numval holds the value
     'c' character literal IDENTIFIER; symval is the entry for the
                           canonical spelling of the literal, its class is
                           set to STOKEN, and its user_token_number is set
                           to the character code unless already nonzero
     , : ; |               COMMA, COLON, SEMICOLON, BAR
     {                     LEFT_CURLY
     = then {              LEFT_CURLY (spaces, tabs and newlines may
                           separate them); `=' followed by anything else
                           yields ILLEGAL
     <name>                TYPENAME; token_buffer holds the name
     %...                  whatever parse_percent_token() returns
     anything else         ILLEGAL

   Malformed character literals and unterminated type names are reported
   through fatal()/fatals(). */
int
lex()
{
  register int c;
  register char *p;

  /* Hand back a token saved by unlex(), together with its symval. */
  if (unlexed >= 0)
    {
      symval = unlexed_symval;
      c = unlexed;
      unlexed = -1;
      return (c);
    }

  c = skip_white_space();

  switch (c)
    {
    case EOF:
      return (ENDFILE);

    case 'A':  case 'B':  case 'C':  case 'D':  case 'E':
    case 'F':  case 'G':  case 'H':  case 'I':  case 'J':
    case 'K':  case 'L':  case 'M':  case 'N':  case 'O':
    case 'P':  case 'Q':  case 'R':  case 'S':  case 'T':
    case 'U':  case 'V':  case 'W':  case 'X':  case 'Y':
    case 'Z':
    case 'a':  case 'b':  case 'c':  case 'd':  case 'e':
    case 'f':  case 'g':  case 'h':  case 'i':  case 'j':
    case 'k':  case 'l':  case 'm':  case 'n':  case 'o':
    case 'p':  case 'q':  case 'r':  case 's':  case 't':
    case 'u':  case 'v':  case 'w':  case 'x':  case 'y':
    case 'z':
    case '.':  case '_':
      p = token_buffer;
      while (isalnum(c) || c == '_' || c == '.')
        {
          if (p == token_buffer + maxtoken)
            p = grow_token_buffer(p);

          *p++ = c;
          c = getc(finput);
        }

      *p = 0;
      /* The character that ended the name belongs to the next token. */
      ungetc(c, finput);
      symval = getsym(token_buffer);
      return (IDENTIFIER);

    case '0':  case '1':  case '2':  case '3':  case '4':
    case '5':  case '6':  case '7':  case '8':  case '9':
      {
        numval = 0;

        while (isdigit(c))
          {
            numval = numval*10 + c - '0';
            c = getc(finput);
          }
        ungetc(c, finput);
        return (NUMBER);
      }

    case '\'':
      translations = -1;

      /* parse the literal token and compute character code in  code  */

      c = getc(finput);
      {
        register int code = 0;

        if (c == '\\')
          {
            c = getc(finput);

            if (c <= '7' && c >= '0')
              {
                /* Octal escape.  On exit C already holds the character
                   that follows the digits. */
                while (c <= '7' && c >= '0')
                  {
                    code = (code * 8) + (c - '0');
                    c = getc(finput);
                    if (code >= 256 || code < 0)
                      fatals("malformatted literal token `\\%03o'", code);
                  }
              }
            else
              {
                if (c == 't')
                  code = '\t';
                else if (c == 'n')
                  code = '\n';
                else if (c == 'a')
                  code = '\007';
                else if (c == 'r')
                  code = '\r';
                else if (c == 'f')
                  code = '\f';
                else if (c == 'b')
                  code = '\b';
                else if (c == 'v')
                  code = 013;
                else if (c == 'x')
                  {
                    c = getc(finput);
                    /* Only 0-9, a-f and A-F are hexadecimal digits.  Any
                       other letter ends the escape and is then rejected
                       below as a second character in the literal. */
                    while ((c <= '9' && c >= '0')
                           || (c >= 'a' && c <= 'f')
                           || (c >= 'A' && c <= 'F'))
                      {
                        code *= 16;
                        if (c <= '9' && c >= '0')
                          code += c - '0';
                        else if (c >= 'a' && c <= 'f')
                          code += c - 'a' + 10;
                        else if (c >= 'A' && c <= 'F')
                          code += c - 'A' + 10;
                        if (code >= 256 || code<0)/* JF this said if(c>=128) */
                          fatals("malformatted literal token `\\x%x'",code);
                        c = getc(finput);
                      }
                    /* Put the terminator back so that the common getc
                       below reads it again. */
                    ungetc(c, finput);
                  }
                else if (c == '\\')
                  code = '\\';
                else if (c == '\'')
                  code = '\'';
                else if (c == '\"')	/* JF this is a good idea */
                  code = '\"';
                else
                  {
                    if (c >= 040 && c <= 0177)
                      fatals ("unknown escape sequence `\\%c'", c);
                    else
                      fatals ("unknown escape sequence: `\\' followed by char code 0x%x", c);
                  }

                c = getc(finput);
              }
          }
        else
          {
            code = c;
            c = getc(finput);
          }

        /* C must now be the closing quote. */
        if (c != '\'')
          fatal("multicharacter literal tokens not supported");

        /* now fill token_buffer with the canonical name for this character
           as a literal token.  Do not use what the user typed,
           so that '\012' and '\n' can be interchangeable.  */

        p = token_buffer;
        *p++ = '\'';
        if (code == '\\')
          {
            *p++ = '\\';
            *p++ = '\\';
          }
        else if (code == '\'')
          {
            *p++ = '\\';
            *p++ = '\'';
          }
        else if (code >= 040 && code != 0177)
          *p++ = code;
        else if (code == '\t')
          {
            *p++ = '\\';
            *p++ = 't';
          }
        else if (code == '\n')
          {
            *p++ = '\\';
            *p++ = 'n';
          }
        else if (code == '\r')
          {
            *p++ = '\\';
            *p++ = 'r';
          }
        else if (code == '\v')
          {
            *p++ = '\\';
            *p++ = 'v';
          }
        else if (code == '\b')
          {
            *p++ = '\\';
            *p++ = 'b';
          }
        else if (code == '\f')
          {
            *p++ = '\\';
            *p++ = 'f';
          }
        else
          {
            /* Remaining control characters are spelled as a backslash
               followed by three octal digits, like the other escapes. */
            *p++ = '\\';
            *p++ = code / 0100 + '0';
            *p++ = ((code / 010) & 07) + '0';
            *p++ = (code & 07) + '0';
          }
        *p++ = '\'';
        *p = 0;
        symval = getsym(token_buffer);
        symval->class = STOKEN;
        /* Keep a token number that has already been assigned. */
        if (! symval->user_token_number)
          symval->user_token_number = code;
        return (IDENTIFIER);
      }

    case ',':
      return (COMMA);

    case ':':
      return (COLON);

    case ';':
      return (SEMICOLON);

    case '|':
      return (BAR);

    case '{':
      return (LEFT_CURLY);

    case '=':
      /* `=' is accepted only as a prefix of `{', possibly separated from
         it by spaces, tabs and newlines. */
      do
        {
          c = getc(finput);
          if (c == '\n') lineno++;
        }
      while(c==' ' || c=='\n' || c=='\t');

      if (c == '{')
        return(LEFT_CURLY);
      else
        {
          ungetc(c, finput);
          return(ILLEGAL);
        }

    case '<':
      /* Collect everything up to the closing `>' on the same line. */
      p = token_buffer;
      c = getc(finput);
      while (c != '>')
        {
          if (c == '\n' || c == EOF)
            fatal("unterminated type name");

          if (p == token_buffer + maxtoken)
            p = grow_token_buffer(p);

          *p++ = c;
          c = getc(finput);
        }
      *p = 0;
      return (TYPENAME);

    case '%':
      return (parse_percent_token());

    default:
      return (ILLEGAL);
    }
}
/* Read the rest of a token that begins with `%' and return its token-type
   code.  The `%' itself has already been read and discarded by the caller.

   The one-character forms %% %{ %< %> %2 %0 %= are recognized directly.
   Otherwise a word made of letters and `_' (starting with a letter) is
   collected into token_buffer, the character that ends it is pushed back
   onto finput, and the word is looked up among the known directive names.
   ILLEGAL is returned for anything that is not recognized. */
int
parse_percent_token ()
{
  /* Directive words and the token-type code each one stands for. */
  static const struct
  {
    const char *word;
    int code;
  } directives[] =
  {
    { "token", TOKEN },
    { "term", TOKEN },
    { "nterm", NTERM },
    { "type", TYPE },
    { "guard", GUARD },
    { "union", UNION },
    { "expect", EXPECT },
    { "start", START },
    { "left", LEFT },
    { "right", RIGHT },
    { "nonassoc", NONASSOC },
    { "binary", NONASSOC },
    { "semantic_parser", SEMANTIC_PARSER },
    { "pure_parser", PURE_PARSER },
    { "prec", PREC }
  };
  int ch;
  int i;
  char *out;

  ch = getc(finput);

  /* One-character directives. */
  switch (ch)
    {
    case '%':
      return (TWO_PERCENTS);

    case '{':
      return (PERCENT_LEFT_CURLY);

    case '<':
      return (LEFT);

    case '>':
      return (RIGHT);

    case '2':
      return (NONASSOC);

    case '0':
      return (TOKEN);

    case '=':
      return (PREC);
    }

  if (!isalpha(ch))
    return (ILLEGAL);

  /* Gather the directive word, enlarging the buffer when it fills up. */
  for (out = token_buffer; isalpha(ch) || ch == '_'; ch = getc(finput))
    {
      if (out == token_buffer + maxtoken)
        out = grow_token_buffer(out);

      *out++ = ch;
    }

  /* The character that ended the word belongs to the next token. */
  ungetc(ch, finput);
  *out = 0;

  for (i = 0; i < (int) (sizeof directives / sizeof directives[0]); i++)
    {
      if (strcmp(token_buffer, directives[i].word) == 0)
        return (directives[i].code);
    }

  return (ILLEGAL);
}
|