tokenreader.cpp 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. #include "tokenreader.h"
  2. /**
  3. * This file is part of uhferret.
  4. *
  5. * Author:: Peter Lane
  6. * Copyright:: Copyright 2011, Peter Lane.
  7. * License:: GPLv3
  8. *
  9. * uhferret is free software: you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation, either version 3 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * uhferret is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public License
  20. * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
  21. */
  22. TokenReader::TokenReader (std::istream & input)
  23. : _input (input),
  24. _position (0),
  25. _done (false)
  26. {}
  27. std::size_t TokenReader::GetToken (TokenSet & tokenset)
  28. {
  29. return tokenset.GetIndexFor (_token.GetString ());
  30. }
  31. bool TokenReader::IsFinished () const
  32. {
  33. return _done;
  34. }
  35. int TokenReader::GetTokenStart () const
  36. {
  37. return _token_start;
  38. }
  39. int TokenReader::GetTokenEnd () const
  40. {
  41. return _token_start + _token.GetLength ();
  42. }
  43. // check if given character is a member of standard roman alphabet
  44. bool WordReader::IsAlphabetChar (char ch)
  45. {
  46. return isalpha (ch);
  47. }
  48. // this function checks if the input character is from a language
  49. // representing words as single characters. Currently, this works
  50. // only for Chinese.
  51. bool WordReader::IsSingleCharWord (char ch)
  52. {
  53. return (ch >= 0x3400 && ch < 0xa000); // check if Chinese
  54. }
  55. // WordReader identifies words as sequences of alphabetic characters
  56. // -- using IsSingleCharWord, WordReader also separates out words from
  57. // languages like Chinese which can represent a complete words as a
  58. // single character
  59. bool WordReader::ReadToken ()
  60. {
  61. if (_done) return false; // reading is done
  62. // step to first alphabetical character
  63. do
  64. {
  65. _input.get (_look);
  66. _position++;
  67. }
  68. while (!IsAlphabetChar (_look) && _input.good ());
  69. // check for finished
  70. if (!_input.good ())
  71. {
  72. _done = true; // mark reading as done
  73. return false; // return with no token read
  74. }
  75. // read in the alphabetical characters
  76. _token.Erase (); // start a new token
  77. _token_start = _position-1; // - 1 because first character is in _look
  78. if (IsSingleCharWord (_look))
  79. {
  80. _token.AddChar (_look);
  81. }
  82. else
  83. {
  84. do
  85. {
  86. _token.AddChar (tolower (_look)); // put everything into lower case
  87. _input.get (_look);
  88. _position++;
  89. }
  90. while (IsAlphabetChar (_look) && !IsSingleCharWord (_look) && _input.good ());
  91. _input.unget (); // replace last character, as not part of token
  92. _position--;
  93. }
  94. // check for finished
  95. if (!_input.good ()) _done = true; // mark reading as done
  96. return true;
  97. }
  98. bool CCodeReader::IsSymbol (char c)
  99. {
  100. return ( c == '!' || c == '%' || c == '/' || c == '*' || c == '+' ||
  101. c == '-' || c == '=' || c == '|' || c == ',' || c == '?' ||
  102. c == '.' || c == '&' || c == '(' || c == ')' || c == '{' ||
  103. c == '}' || c == '<' || c == '>' || c == ':' || c == ';' ||
  104. c == '^' || c == '[' || c == ']' || c == '"' || c == '#' ||
  105. c == '~' );
  106. }
  107. bool CCodeReader::IsSymbol (std::string token, char c)
  108. {
  109. std::string candidate = token + c;
  110. return ( candidate == "!=" || candidate == "++" ||
  111. candidate == "--" || candidate == "==" ||
  112. candidate == ">=" || candidate == "<=" ||
  113. candidate == "||" || candidate == "&&" ||
  114. candidate == "+=" || candidate == "-=" ||
  115. candidate == "*=" || candidate == "/=" ||
  116. candidate == "%=" || candidate == "&=" ||
  117. candidate == "|=" || candidate == "^=" ||
  118. candidate == "::" || candidate == "->" ||
  119. candidate == "//" || candidate == "<<" ||
  120. candidate == ">>" || candidate == "##" ||
  121. candidate == "/*" || candidate == "*/" ||
  122. candidate == "/**" );
  123. }
  124. bool CCodeReader::ReadToken ()
  125. {
  126. if (_done) return false;
  127. // step to first non-whitespace character
  128. do
  129. {
  130. _input.get (_look);
  131. _position++;
  132. }
  133. while (std::isspace (_look) && _input.good ());
  134. // check for finished
  135. if (!_input.good ())
  136. {
  137. _done = true; // mark reading as done
  138. return false; // return with no token read
  139. }
  140. // read in the token
  141. _token.Erase (); // start a new token
  142. _token_start = _position-1; // - 1 because first character is in _look
  143. // check for different cases -- note, precise syntax not important!
  144. if (IsSymbol (_look))
  145. {
  146. // read in a symbol
  147. do
  148. {
  149. _token.AddChar (_look);
  150. _input.get (_look);
  151. _position++;
  152. }
  153. while ((IsSymbol (_token.GetString (), _look)) && (_input.good ()));
  154. }
  155. else if (std::isdigit(_look) || _look == '.')
  156. {
  157. // read in a number
  158. do
  159. {
  160. _token.AddChar (_look);
  161. _input.get (_look);
  162. _position++;
  163. }
  164. while ((std::isdigit (_look) || _look == '.') && (_input.good ()));
  165. }
  166. else
  167. { // assume we have characters for a variable or other name
  168. do
  169. {
  170. _token.AddChar (_look);
  171. _input.get (_look);
  172. _position++;
  173. }
  174. while ((std::isalnum (_look) || _look == '_') && (_input.good ()));
  175. }
  176. _input.unget (); // replace last character, as not part of token
  177. _position--;
  178. // check for finished
  179. if (!_input.good ()) _done = true; // mark reading as done
  180. return true;
  181. }