// peterlane/uhferret-gem
#include "tokenreader.h"

#include <cctype>

/** 
 * This file is part of uhferret.
 *
 * Author::    Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License::   GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Construct a reader over the given input stream.
 *
 * The stream initialises the _input member directly, so it is used in
 * place rather than copied; the caller must keep it alive for as long as
 * the reader is used.  Reading starts at position 0 and the reader is
 * not yet marked as done.
 */
TokenReader::TokenReader (std::istream & input)
	: _input (input),
	  _position (0),
	  _done (false)
{
	// Give the token start a defined value, so that GetTokenStart and
	// GetTokenEnd do not read an indeterminate integer when they are
	// called before the first ReadToken.  Assigned here rather than in
	// the initialiser list so the member declaration order is irrelevant.
	_token_start = 0;
}

// Look up the text of the current token in the supplied token set and
// return the index which the set reports for it.
std::size_t TokenReader::GetToken (TokenSet & tokenset)
{
	const std::size_t token_index = tokenset.GetIndexFor (_token.GetString ());
	return token_index;
}

// Report whether reading has finished, i.e. whether the _done flag has
// been set.
bool TokenReader::IsFinished () const 
{
	const bool finished = _done;
	return finished;
}

// Return the recorded start position of the current token.
int TokenReader::GetTokenStart () const
{
	const int start_position = _token_start;
	return start_position;
}

// Return the end position of the current token: its start position plus
// the length reported by the token itself.
int TokenReader::GetTokenEnd () const
{
	const int end_position = _token_start + _token.GetLength ();
	return end_position;
}

// Check whether the given character is alphabetic, as classified by
// std::isalpha.
//
// The character is converted to unsigned char before the call: the
// <cctype> classification functions have undefined behaviour when given
// a negative value, which is what any byte above 0x7f becomes on
// platforms where plain char is signed.
bool WordReader::IsAlphabetChar (char ch)
{
	return std::isalpha (static_cast<unsigned char> (ch)) != 0;
}

// Decide whether a character belongs to a language which writes a whole
// word as a single character.  Only one range of codes is tested, the one
// used here for Chinese: 0x3400 up to, but not including, 0xa000.
// NOTE(review): ch is a plain char, so wherever char is 8 bits wide its
// value can never reach 0x3400 and this test cannot succeed -- confirm
// whether a wider character type was intended.
bool WordReader::IsSingleCharWord (char ch)
{
	const int chinese_first = 0x3400;	// first code treated as Chinese
	const int chinese_limit = 0xa000;	// one past the last such code
	const int code = ch;

	return chinese_first <= code && code < chinese_limit;
}

// Read the next word from the input into _token.
//
// WordReader identifies words as sequences of alphabetic characters, which
// are stored in lower case.
//  -- if the first alphabetic character found is also accepted by
//     IsSingleCharWord (languages like Chinese, which can represent a
//     complete word as a single character), that character alone becomes
//     the token and is stored unchanged
//  -- such a character also ends any ordinary word which precedes it
//
// Returns true when a token was read.  Returns false when reading was
// already done, or when the input ran out before another alphabetic
// character was found.  _done is set once the stream is no longer good.
bool WordReader::ReadToken ()
{
	if (_done) return false;	// reading is done
	// step to first alphabetical character
	do
	{
		_input.get (_look);
		_position++;
	}
	// test the stream first: after a failed read _look holds no new
	// character (and may never have been written at all)
	while (_input.good () && !IsAlphabetChar (_look));
	// check for finished
	if (!_input.good ())
	{
		_done = true;	// mark reading as done
		return false;	// return with no token read
	}
	// read in the alphabetical characters
	_token.Erase ();		// start a new token
	_token_start = _position-1;	// - 1 because first character is in _look
	if (IsSingleCharWord (_look))
	{
		_token.AddChar (_look);
	}
	else
	{
		do
		{
			// put everything into lower case; convert through unsigned
			// char because tolower is undefined for negative values
			_token.AddChar (std::tolower (static_cast<unsigned char> (_look)));
			_input.get (_look);
			_position++;
		}
		while (_input.good () && IsAlphabetChar (_look) && !IsSingleCharWord (_look));
		_input.unget ();	// replace last character, as not part of token
		_position--;
	}
	// check for finished
	if (!_input.good ()) _done = true;	// mark reading as done

	return true;
}

// Check whether a single character is one of the punctuation characters
// which this reader treats as the start of a symbol token.
bool CCodeReader::IsSymbol (char c)
{
	switch (c)
	{
		case '!': case '%': case '/': case '*': case '+':
		case '-': case '=': case '|': case ',': case '?':
		case '.': case '&': case '(': case ')': case '{':
		case '}': case '<': case '>': case ':': case ';':
		case '^': case '[': case ']': case '"': case '#':
		case '~':
			return true;
		default:
			return false;
	}
}

// Check whether appending character c to the symbol read so far gives one
// of the recognised multi-character symbols, i.e. whether the symbol token
// may be extended by c.
bool CCodeReader::IsSymbol (std::string token, char c)
{
	// every multi-character symbol which is recognised
	static const char * const compound_symbols [] =
	{
		"!=", "++", "--", "==", ">=", "<=", "||", "&&",
		"+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=",
		"::", "->", "//", "<<", ">>", "##", "/*", "*/",
		"/**"
	};
	const std::size_t symbol_count =
		sizeof (compound_symbols) / sizeof (compound_symbols[0]);

	token += c;	// token is our own copy, so extend it in place
	for (std::size_t i = 0; i < symbol_count; ++i)
	{
		if (token == compound_symbols[i]) return true;
	}
	return false;
}

// Read the next token of C-like source code into _token.
//
// Leading whitespace is skipped, then the first character decides the kind
// of token which is read:
//  -- a symbol character starts a symbol, extended for as long as the
//     two-argument IsSymbol accepts the text read so far plus the next
//     character
//  -- a digit starts a number, made up of digits and '.'
//  -- anything else starts a name, continued with letters, digits and '_'
// The classification is deliberately loose: precise syntax is not important.
//
// Returns true when a token was read.  Returns false when reading was
// already done, or when the input ran out before a non-whitespace
// character was found.  _done is set once the stream is no longer good.
//
// In every loop the stream is tested before _look is examined, because
// after a failed read _look holds no new character.  Characters are
// converted to unsigned char before being handed to the <cctype>
// functions, which are undefined for negative values.
bool CCodeReader::ReadToken ()
{
	if (_done) return false;
	// step to first non-whitespace character
	do
	{
		_input.get (_look);
		_position++;
	}
	while (_input.good () && std::isspace (static_cast<unsigned char> (_look)));
	// check for finished
	if (!_input.good ())
	{
		_done = true;	// mark reading as done
		return false;	// return with no token read
	}
	// read in the token
	_token.Erase ();		// start a new token
	_token_start = _position-1;	// - 1 because first character is in _look
	// check for different cases -- note, precise syntax not important!
	if (IsSymbol (_look))
	{
		// read in a symbol
		do
		{
			_token.AddChar (_look);
			_input.get (_look);
			_position++;
		}
		while (_input.good () && IsSymbol (_token.GetString (), _look));
	}
	// note: '.' is itself a symbol character, so it is always taken by the
	// branch above and a number read here starts with a digit
	else if (std::isdigit (static_cast<unsigned char> (_look)) || _look == '.')
	{
		// read in a number
		do
		{
			_token.AddChar (_look);
			_input.get (_look);
			_position++;
		}
		while (_input.good () &&
		       (std::isdigit (static_cast<unsigned char> (_look)) || _look == '.'));
	}
	else
	{ // assume we have characters for a variable or other name 
		do
		{
			_token.AddChar (_look);
			_input.get (_look);
			_position++;
		}
		while (_input.good () &&
		       (std::isalnum (static_cast<unsigned char> (_look)) || _look == '_'));
	}
	_input.unget (); // replace last character, as not part of token
	_position--;
	// check for finished
	if (!_input.good ()) _done = true;	// mark reading as done

	return true;
}