// Source repository: peterlane/uhferret-gem
							#if !defined tokenreader_h 
#define tokenreader_h

/** 
 * This file is part of uhferret.
 *
 * Author::    Peter Lane
 * Copyright:: Copyright 2011, Peter Lane.
 * License::   GPLv3
 *
 * uhferret is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * uhferret is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
#include <istream>

#include "tokenset.h"

/** The TokenReader is the parent class of the different 'token-isers'
  * -- WordReader tokenises a document into strings of alphanumeric characters
  * -- CCodeReader tokenises a document into symbols matching a C-style language
  * The token reader is initialised with an input stream
  * -- GetToken is used to 'walk through' the document, one token at a time
  *    until IsFinished returns true.
  * -- the start and end points of the token can be retrieved using the given methods,
  *    and the string making up the token can be obtained by caller from the TokenSet
  */
class TokenReader
{
	public:
		// Construct a reader over the given input stream. The stream is held
		// by reference (see _input), so it must outlive this reader.
		TokenReader (std::istream & input);
		// Virtual destructor: TokenReader is an abstract polymorphic base, so
		// destroying a concrete reader through a TokenReader pointer or
		// reference must dispatch to the derived class's destructor.
		virtual ~TokenReader () {}
		// return index of last read token
		// -- the token's string can be looked up by the caller in 'tokenset'
		std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
		bool IsFinished () const;	// return true if end-of-file reached
		int GetTokenStart () const;	// return the start position of current token
		int GetTokenEnd () const;	// return the end position of current token
		// read token, return true if successful
		// -- user of class must provide this method
		virtual bool ReadToken () = 0;
	protected: // allow subclasses to access parameters
		std::istream &	_input;    // the stream from which to read
		int		_position; // current position in stream
		Token		_token;    // last token read
		int		_token_start;	// start position of last token read
		char		_look;	   // lookahead character
		bool		_done;	   // becomes true when stream is completed
};

// The WordReader separates its input stream into tokens, consisting of 
//     consecutive alphabetic characters 
//     -- every character is converted to lower case
class WordReader: public TokenReader
{
	public:
		// Build a word tokeniser over 'input'; all set-up is delegated to
		// the TokenReader constructor.
		WordReader (std::istream & input)
			: TokenReader (input)
		{
		}

		// Implements the pure virtual TokenReader::ReadToken for this reader.
		virtual bool ReadToken ();

		// Character tests declared for this reader; their definitions are
		// not in this header.
		// NOTE(review): presumably used by ReadToken to decide which
		// characters belong to a word -- confirm against the definitions.
		bool IsAlphabetChar (char ch);
		bool IsSingleCharWord (char ch);
};

// The CCodeReader separates its input stream into tokens, looking for 
//     C-style tokens, numbers and symbols
class CCodeReader: public TokenReader
{
	public:
		// Build a C-style code tokeniser over 'input'; all set-up is
		// delegated to the TokenReader constructor.
		CCodeReader (std::istream & input)
			: TokenReader (input)
		{
		}

		// Implements the pure virtual TokenReader::ReadToken for this reader.
		virtual bool ReadToken ();

	private:
		// Symbol tests, overloaded on whether a token built so far is
		// supplied alongside the character; defined outside this header.
		// NOTE(review): presumably the two-argument form checks whether
		// 'token' extended by 'c' is still a symbol -- confirm against
		// the definition.
		bool IsSymbol (char c);
		bool IsSymbol (std::string token, char c);
};

#endif