1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586 |
- #if !defined tokenreader_h
- #define tokenreader_h
- /**
- * This file is part of uhferret.
- *
- * Author:: Peter Lane
- * Copyright:: Copyright 2011, Peter Lane.
- * License:: GPLv3
- *
- * uhferret is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * uhferret is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
- #include <istream>
- #include "tokenset.h"
- /** The TokenReader is the parent class of the different 'token-isers'
- * -- WordReader tokenises a document into strings of alphanumeric characters
- * -- CCodeReader tokenises a document into symbols matching a C-style language
- * The token reader is initialised with an input stream
- * -- GetToken is used to 'walk through' the document, one token at a time
- * until IsFinished returns true.
- * -- the start and end points of the token can be retrieved using the given methods,
- * and the string making up the token can be obtained by caller from the TokenSet
- */
- class TokenReader
- {
- public:
- TokenReader (std::istream & input);
- // return index of last read token
- std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
- bool IsFinished () const; // return true if end-of-file reached
- int GetTokenStart () const; // return the start position of current token
- int GetTokenEnd () const; // return the end position of current token
- // read token, return true if successful
- // -- user of class must provide this method
- virtual bool ReadToken () = 0;
- protected: // allow subclasses to access parameters
- std::istream & _input; // the stream from which to read
- int _position; // current position in stream
- Token _token; // last token read
- int _token_start; // start position of last token read
- char _look; // lookahead character
- bool _done; // becomes true when stream is completed
- };
- // The WordReader separates its input stream into tokens, consisting of
- // consecutive alphabetic characters
- // -- every character is converted to lower case
- class WordReader: public TokenReader
- {
- public:
- WordReader (std::istream & input) : TokenReader (input) {}
- bool IsAlphabetChar (char ch);
- bool IsSingleCharWord (char ch);
- bool ReadToken ();
- };
- // The CCodeReader separates its input stream into tokens, looking for
- // C-style tokens, numbers and symbols
- class CCodeReader: public TokenReader
- {
- public:
- CCodeReader (std::istream & input) : TokenReader (input) {}
- bool ReadToken ();
- private:
- bool IsSymbol (char c);
- bool IsSymbol (std::string token, char c);
- };
- #endif
|