123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 |
- #include "tokenreader.h"
- /**
- * This file is part of uhferret.
- *
- * Author:: Peter Lane
- * Copyright:: Copyright 2011, Peter Lane.
- * License:: GPLv3
- *
- * uhferret is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * uhferret is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- */
- TokenReader::TokenReader (std::istream & input)
- : _input (input),
- _position (0),
- _done (false)
- {}
- std::size_t TokenReader::GetToken (TokenSet & tokenset)
- {
- return tokenset.GetIndexFor (_token.GetString ());
- }
- bool TokenReader::IsFinished () const
- {
- return _done;
- }
- int TokenReader::GetTokenStart () const
- {
- return _token_start;
- }
- int TokenReader::GetTokenEnd () const
- {
- return _token_start + _token.GetLength ();
- }
- // check if given character is a member of standard roman alphabet
- bool WordReader::IsAlphabetChar (char ch)
- {
- return isalpha (ch);
- }
- // this function checks if the input character is from a language
- // representing words as single characters. Currently, this works
- // only for Chinese.
- bool WordReader::IsSingleCharWord (char ch)
- {
- return (ch >= 0x3400 && ch < 0xa000); // check if Chinese
- }
- // WordReader identifies words as sequences of alphabetic characters
- // -- using IsSingleCharWord, WordReader also separates out words from
- // languages like Chinese which can represent a complete words as a
- // single character
- bool WordReader::ReadToken ()
- {
- if (_done) return false; // reading is done
- // step to first alphabetical character
- do
- {
- _input.get (_look);
- _position++;
- }
- while (!IsAlphabetChar (_look) && _input.good ());
- // check for finished
- if (!_input.good ())
- {
- _done = true; // mark reading as done
- return false; // return with no token read
- }
- // read in the alphabetical characters
- _token.Erase (); // start a new token
- _token_start = _position-1; // - 1 because first character is in _look
- if (IsSingleCharWord (_look))
- {
- _token.AddChar (_look);
- }
- else
- {
- do
- {
- _token.AddChar (tolower (_look)); // put everything into lower case
- _input.get (_look);
- _position++;
- }
- while (IsAlphabetChar (_look) && !IsSingleCharWord (_look) && _input.good ());
- _input.unget (); // replace last character, as not part of token
- _position--;
- }
- // check for finished
- if (!_input.good ()) _done = true; // mark reading as done
- return true;
- }
- bool CCodeReader::IsSymbol (char c)
- {
- return ( c == '!' || c == '%' || c == '/' || c == '*' || c == '+' ||
- c == '-' || c == '=' || c == '|' || c == ',' || c == '?' ||
- c == '.' || c == '&' || c == '(' || c == ')' || c == '{' ||
- c == '}' || c == '<' || c == '>' || c == ':' || c == ';' ||
- c == '^' || c == '[' || c == ']' || c == '"' || c == '#' ||
- c == '~' );
- }
- bool CCodeReader::IsSymbol (std::string token, char c)
- {
- std::string candidate = token + c;
- return ( candidate == "!=" || candidate == "++" ||
- candidate == "--" || candidate == "==" ||
- candidate == ">=" || candidate == "<=" ||
- candidate == "||" || candidate == "&&" ||
- candidate == "+=" || candidate == "-=" ||
- candidate == "*=" || candidate == "/=" ||
- candidate == "%=" || candidate == "&=" ||
- candidate == "|=" || candidate == "^=" ||
- candidate == "::" || candidate == "->" ||
- candidate == "//" || candidate == "<<" ||
- candidate == ">>" || candidate == "##" ||
- candidate == "/*" || candidate == "*/" ||
- candidate == "/**" );
- }
- bool CCodeReader::ReadToken ()
- {
- if (_done) return false;
- // step to first non-whitespace character
- do
- {
- _input.get (_look);
- _position++;
- }
- while (std::isspace (_look) && _input.good ());
- // check for finished
- if (!_input.good ())
- {
- _done = true; // mark reading as done
- return false; // return with no token read
- }
- // read in the token
- _token.Erase (); // start a new token
- _token_start = _position-1; // - 1 because first character is in _look
- // check for different cases -- note, precise syntax not important!
- if (IsSymbol (_look))
- {
- // read in a symbol
- do
- {
- _token.AddChar (_look);
- _input.get (_look);
- _position++;
- }
- while ((IsSymbol (_token.GetString (), _look)) && (_input.good ()));
- }
- else if (std::isdigit(_look) || _look == '.')
- {
- // read in a number
- do
- {
- _token.AddChar (_look);
- _input.get (_look);
- _position++;
- }
- while ((std::isdigit (_look) || _look == '.') && (_input.good ()));
- }
- else
- { // assume we have characters for a variable or other name
- do
- {
- _token.AddChar (_look);
- _input.get (_look);
- _position++;
- }
- while ((std::isalnum (_look) || _look == '_') && (_input.good ()));
- }
- _input.unget (); // replace last character, as not part of token
- _position--;
- // check for finished
- if (!_input.good ()) _done = true; // mark reading as done
- return true;
- }
|