tokenreader.h 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #if !defined tokenreader_h
  2. #define tokenreader_h
  3. /**
  4. * This file is part of uhferret.
  5. *
  6. * Author:: Peter Lane
  7. * Copyright:: Copyright 2011, Peter Lane.
  8. * License:: GPLv3
  9. *
  10. * uhferret is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * uhferret is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  22. */
  23. #include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
  24. #include <istream>
  25. #include "tokenset.h"
  26. /** The TokenReader is the parent class of the different 'token-isers'
  27. * -- WordReader tokenises a document into strings of alphanumeric characters
  28. * -- CCodeReader tokenises a document into symbols matching a C-style language
  29. * The token reader is initialised with an input stream
  30. * -- GetToken is used to 'walk through' the document, one token at a time
  31. * until IsFinished returns true.
  32. * -- the start and end points of the token can be retrieved using the given methods,
  33. * and the string making up the token can be obtained by caller from the TokenSet
  34. */
  35. class TokenReader
  36. {
  37. public:
  38. TokenReader (std::istream & input);
  39. // return index of last read token
  40. std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
  41. bool IsFinished () const; // return true if end-of-file reached
  42. int GetTokenStart () const; // return the start position of current token
  43. int GetTokenEnd () const; // return the end position of current token
  44. // read token, return true if successful
  45. // -- user of class must provide this method
  46. virtual bool ReadToken () = 0;
  47. protected: // allow subclasses to access parameters
  48. std::istream & _input; // the stream from which to read
  49. int _position; // current position in stream
  50. Token _token; // last token read
  51. int _token_start; // start position of last token read
  52. char _look; // lookahead character
  53. bool _done; // becomes true when stream is completed
  54. };
  55. // The WordReader separates its input stream into tokens, consisting of
  56. // consecutive alphabetic characters
  57. // -- every character is converted to lower case
  58. class WordReader: public TokenReader
  59. {
  60. public:
  61. WordReader (std::istream & input) : TokenReader (input) {}
  62. bool IsAlphabetChar (char ch);
  63. bool IsSingleCharWord (char ch);
  64. bool ReadToken ();
  65. };
  66. // The CCodeReader separates its input stream into tokens, looking for
  67. // C-style tokens, numbers and symbols
  68. class CCodeReader: public TokenReader
  69. {
  70. public:
  71. CCodeReader (std::istream & input) : TokenReader (input) {}
  72. bool ReadToken ();
  73. private:
  74. bool IsSymbol (char c);
  75. bool IsSymbol (std::string token, char c);
  76. };
  77. #endif