gdscript_tokenizer.cpp 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567
  1. /**************************************************************************/
  2. /* gdscript_tokenizer.cpp */
  3. /**************************************************************************/
  4. /* This file is part of: */
  5. /* GODOT ENGINE */
  6. /* https://godotengine.org */
  7. /**************************************************************************/
  8. /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
  9. /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
  10. /* */
  11. /* Permission is hereby granted, free of charge, to any person obtaining */
  12. /* a copy of this software and associated documentation files (the */
  13. /* "Software"), to deal in the Software without restriction, including */
  14. /* without limitation the rights to use, copy, modify, merge, publish, */
  15. /* distribute, sublicense, and/or sell copies of the Software, and to */
  16. /* permit persons to whom the Software is furnished to do so, subject to */
  17. /* the following conditions: */
  18. /* */
  19. /* The above copyright notice and this permission notice shall be */
  20. /* included in all copies or substantial portions of the Software. */
  21. /* */
  22. /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
  23. /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
  24. /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
  25. /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
  26. /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
  27. /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
  28. /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  29. /**************************************************************************/
  30. #include "gdscript_tokenizer.h"
  31. #include "core/error/error_macros.h"
  32. #include "core/string/char_utils.h"
  33. #ifdef TOOLS_ENABLED
  34. #include "editor/editor_settings.h"
  35. #endif
  36. #ifdef DEBUG_ENABLED
  37. #include "servers/text_server.h"
  38. #endif
  // Display names for every token type, indexed by the token type's enum value.
  // The order of the entries must follow the enum order exactly; the enum member
  // names matching each row are listed in the comment above it.
  static const char *token_names[] = {
      // EMPTY
      "Empty",
      // Basic: ANNOTATION, IDENTIFIER, LITERAL
      "Annotation", "Identifier", "Literal",
      // Comparison: LESS, LESS_EQUAL, GREATER, GREATER_EQUAL, EQUAL_EQUAL, BANG_EQUAL
      "<", "<=", ">", ">=", "==", "!=",
      // Logical: AND, OR, NOT, AMPERSAND_AMPERSAND, PIPE_PIPE, BANG
      "and", "or", "not", "&&", "||", "!",
      // Bitwise: AMPERSAND, PIPE, TILDE, CARET, LESS_LESS, GREATER_GREATER
      "&", "|", "~", "^", "<<", ">>",
      // Math: PLUS, MINUS, STAR, STAR_STAR, SLASH, PERCENT
      "+", "-", "*", "**", "/", "%",
      // Assignment: EQUAL, PLUS_EQUAL, MINUS_EQUAL, STAR_EQUAL, STAR_STAR_EQUAL, SLASH_EQUAL,
      // PERCENT_EQUAL, LESS_LESS_EQUAL, GREATER_GREATER_EQUAL, AMPERSAND_EQUAL, PIPE_EQUAL, CARET_EQUAL
      "=", "+=", "-=", "*=", "**=", "/=",
      "%=", "<<=", ">>=", "&=", "|=", "^=",
      // Control flow: IF, ELIF, ELSE, FOR, WHILE, BREAK, CONTINUE, PASS, RETURN, MATCH
      "if", "elif", "else", "for", "while", "break", "continue", "pass", "return", "match",
      // Keywords: AS, ASSERT, AWAIT, BREAKPOINT, CLASS, CLASS_NAME, CONST, ENUM, EXTENDS, FUNC, IN,
      // IS, NAMESPACE, PRELOAD, SELF, SIGNAL, STATIC, SUPER, TRAIT, VAR, VOID, YIELD
      "as", "assert", "await", "breakpoint", "class", "class_name", "const", "enum", "extends", "func", "in",
      "is", "namespace", "preload", "self", "signal", "static", "super", "trait", "var", "void", "yield",
      // Punctuation: BRACKET_OPEN, BRACKET_CLOSE, BRACE_OPEN, BRACE_CLOSE, PARENTHESIS_OPEN,
      // PARENTHESIS_CLOSE, COMMA, SEMICOLON, PERIOD, PERIOD_PERIOD, COLON, DOLLAR, FORWARD_ARROW, UNDERSCORE
      "[", "]", "{", "}", "(",
      ")", ",", ";", ".", "..", ":", "$", "->", "_",
      // Whitespace: NEWLINE, INDENT, DEDENT
      "Newline", "Indent", "Dedent",
      // Constants: CONST_PI, CONST_TAU, CONST_INF, CONST_NAN
      "PI", "TAU", "INF", "NaN",
      // Error message improvement: VCS_CONFLICT_MARKER, BACKTICK, QUESTION_MARK
      "VCS conflict marker", "`", "?",
      // Special: ERROR, EOF
      "Error", "End of file",
  };
  152. // Avoid desync.
  153. static_assert(sizeof(token_names) / sizeof(token_names[0]) == GDScriptTokenizer::Token::TK_MAX, "Amount of token names don't match the amount of token types.");
  154. const char *GDScriptTokenizer::Token::get_name() const {
  155. ERR_FAIL_INDEX_V_MSG(type, TK_MAX, "<error>", "Using token type out of the enum.");
  156. return token_names[type];
  157. }
  158. bool GDScriptTokenizer::Token::is_identifier() const {
  159. // Note: Most keywords should not be recognized as identifiers.
  160. // These are only exceptions for stuff that already is on the engine's API.
  161. switch (type) {
  162. case IDENTIFIER:
  163. case MATCH: // Used in String.match().
  164. // Allow constants to be treated as regular identifiers.
  165. case CONST_PI:
  166. case CONST_INF:
  167. case CONST_NAN:
  168. case CONST_TAU:
  169. return true;
  170. default:
  171. return false;
  172. }
  173. }
  174. bool GDScriptTokenizer::Token::is_node_name() const {
  175. // This is meant to allow keywords with the $ notation, but not as general identifiers.
  176. switch (type) {
  177. case IDENTIFIER:
  178. case AND:
  179. case AS:
  180. case ASSERT:
  181. case AWAIT:
  182. case BREAK:
  183. case BREAKPOINT:
  184. case CLASS_NAME:
  185. case CLASS:
  186. case CONST:
  187. case CONST_PI:
  188. case CONST_INF:
  189. case CONST_NAN:
  190. case CONST_TAU:
  191. case CONTINUE:
  192. case ELIF:
  193. case ELSE:
  194. case ENUM:
  195. case EXTENDS:
  196. case FOR:
  197. case FUNC:
  198. case IF:
  199. case IN:
  200. case IS:
  201. case MATCH:
  202. case NAMESPACE:
  203. case NOT:
  204. case OR:
  205. case PASS:
  206. case PRELOAD:
  207. case RETURN:
  208. case SELF:
  209. case SIGNAL:
  210. case STATIC:
  211. case SUPER:
  212. case TRAIT:
  213. case UNDERSCORE:
  214. case VAR:
  215. case VOID:
  216. case WHILE:
  217. case YIELD:
  218. return true;
  219. default:
  220. return false;
  221. }
  222. }
  223. String GDScriptTokenizer::get_token_name(Token::Type p_token_type) {
  224. ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>", "Using token type out of the enum.");
  225. return token_names[p_token_type];
  226. }
  227. void GDScriptTokenizer::set_source_code(const String &p_source_code) {
  228. source = p_source_code;
  229. if (source.is_empty()) {
  230. _source = U"";
  231. } else {
  232. _source = source.ptr();
  233. }
  234. _current = _source;
  235. line = 1;
  236. column = 1;
  237. length = p_source_code.length();
  238. position = 0;
  239. }
  240. void GDScriptTokenizer::set_cursor_position(int p_line, int p_column) {
  241. cursor_line = p_line;
  242. cursor_column = p_column;
  243. }
  244. void GDScriptTokenizer::set_multiline_mode(bool p_state) {
  245. multiline_mode = p_state;
  246. }
  247. void GDScriptTokenizer::push_expression_indented_block() {
  248. indent_stack_stack.push_back(indent_stack);
  249. }
  250. void GDScriptTokenizer::pop_expression_indented_block() {
  251. ERR_FAIL_COND(indent_stack_stack.size() == 0);
  252. indent_stack = indent_stack_stack.back()->get();
  253. indent_stack_stack.pop_back();
  254. }
  255. int GDScriptTokenizer::get_cursor_line() const {
  256. return cursor_line;
  257. }
  258. int GDScriptTokenizer::get_cursor_column() const {
  259. return cursor_column;
  260. }
  261. bool GDScriptTokenizer::is_past_cursor() const {
  262. if (line < cursor_line) {
  263. return false;
  264. }
  265. if (line > cursor_line) {
  266. return true;
  267. }
  268. if (column < cursor_column) {
  269. return false;
  270. }
  271. return true;
  272. }
  273. char32_t GDScriptTokenizer::_advance() {
  274. if (unlikely(_is_at_end())) {
  275. return '\0';
  276. }
  277. _current++;
  278. column++;
  279. position++;
  280. if (column > rightmost_column) {
  281. rightmost_column = column;
  282. }
  283. if (unlikely(_is_at_end())) {
  284. // Add extra newline even if it's not there, to satisfy the parser.
  285. newline(true);
  286. // Also add needed unindent.
  287. check_indent();
  288. }
  289. return _peek(-1);
  290. }
  291. void GDScriptTokenizer::push_paren(char32_t p_char) {
  292. paren_stack.push_back(p_char);
  293. }
  294. bool GDScriptTokenizer::pop_paren(char32_t p_expected) {
  295. if (paren_stack.is_empty()) {
  296. return false;
  297. }
  298. char32_t actual = paren_stack.back()->get();
  299. paren_stack.pop_back();
  300. return actual == p_expected;
  301. }
  302. GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
  303. Token error = error_stack.back()->get();
  304. error_stack.pop_back();
  305. return error;
  306. }
  307. GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
  308. Token token(p_type);
  309. token.start_line = start_line;
  310. token.end_line = line;
  311. token.start_column = start_column;
  312. token.end_column = column;
  313. token.leftmost_column = leftmost_column;
  314. token.rightmost_column = rightmost_column;
  315. token.source = String(_start, _current - _start);
  316. if (p_type != Token::ERROR && cursor_line > -1) {
  317. // Also count whitespace after token.
  318. int offset = 0;
  319. while (_peek(offset) == ' ' || _peek(offset) == '\t') {
  320. offset++;
  321. }
  322. int last_column = column + offset;
  323. // Check cursor position in token.
  324. if (start_line == line) {
  325. // Single line token.
  326. if (cursor_line == start_line && cursor_column >= start_column && cursor_column <= last_column) {
  327. token.cursor_position = cursor_column - start_column;
  328. if (cursor_column == start_column) {
  329. token.cursor_place = CURSOR_BEGINNING;
  330. } else if (cursor_column < column) {
  331. token.cursor_place = CURSOR_MIDDLE;
  332. } else {
  333. token.cursor_place = CURSOR_END;
  334. }
  335. }
  336. } else {
  337. // Multi line token.
  338. if (cursor_line == start_line && cursor_column >= start_column) {
  339. // Is in first line.
  340. token.cursor_position = cursor_column - start_column;
  341. if (cursor_column == start_column) {
  342. token.cursor_place = CURSOR_BEGINNING;
  343. } else {
  344. token.cursor_place = CURSOR_MIDDLE;
  345. }
  346. } else if (cursor_line == line && cursor_column <= last_column) {
  347. // Is in last line.
  348. token.cursor_position = cursor_column - start_column;
  349. if (cursor_column < column) {
  350. token.cursor_place = CURSOR_MIDDLE;
  351. } else {
  352. token.cursor_place = CURSOR_END;
  353. }
  354. } else if (cursor_line > start_line && cursor_line < line) {
  355. // Is in middle line.
  356. token.cursor_position = CURSOR_MIDDLE;
  357. }
  358. }
  359. }
  360. return token;
  361. }
  362. GDScriptTokenizer::Token GDScriptTokenizer::make_literal(const Variant &p_literal) {
  363. Token token = make_token(Token::LITERAL);
  364. token.literal = p_literal;
  365. return token;
  366. }
  367. GDScriptTokenizer::Token GDScriptTokenizer::make_identifier(const StringName &p_identifier) {
  368. Token identifier = make_token(Token::IDENTIFIER);
  369. identifier.literal = p_identifier;
  370. return identifier;
  371. }
  372. GDScriptTokenizer::Token GDScriptTokenizer::make_error(const String &p_message) {
  373. Token error = make_token(Token::ERROR);
  374. error.literal = p_message;
  375. return error;
  376. }
  377. void GDScriptTokenizer::push_error(const String &p_message) {
  378. Token error = make_error(p_message);
  379. error_stack.push_back(error);
  380. }
  381. void GDScriptTokenizer::push_error(const Token &p_error) {
  382. error_stack.push_back(p_error);
  383. }
  384. GDScriptTokenizer::Token GDScriptTokenizer::make_paren_error(char32_t p_paren) {
  385. if (paren_stack.is_empty()) {
  386. return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart.", p_paren));
  387. }
  388. Token error = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\".", p_paren, paren_stack.back()->get()));
  389. paren_stack.pop_back(); // Remove opening one anyway.
  390. return error;
  391. }
  392. GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, Token::Type p_double_type) {
  393. const char32_t *next = _current + 1;
  394. int chars = 2; // Two already matched.
  395. // Test before consuming characters, since we don't want to consume more than needed.
  396. while (*next == p_test) {
  397. chars++;
  398. next++;
  399. }
  400. if (chars >= 7) {
  401. // It is a VCS conflict marker.
  402. while (chars > 1) {
  403. // Consume all characters (first was already consumed by scan()).
  404. _advance();
  405. chars--;
  406. }
  407. return make_token(Token::VCS_CONFLICT_MARKER);
  408. } else {
  409. // It is only a regular double character token, so we consume the second character.
  410. _advance();
  411. return make_token(p_double_type);
  412. }
  413. }
  414. GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
  415. if (is_unicode_identifier_start(_peek())) {
  416. _advance(); // Consume start character.
  417. } else {
  418. push_error("Expected annotation identifier after \"@\".");
  419. }
  420. while (is_unicode_identifier_continue(_peek())) {
  421. // Consume all identifier characters.
  422. _advance();
  423. }
  424. Token annotation = make_token(Token::ANNOTATION);
  425. annotation.literal = StringName(annotation.source);
  426. return annotation;
  427. }
  // X-macro table of all keywords, grouped by their first character.
  // KEYWORD_GROUP(c) marks the start of the group for first character c, and
  // KEYWORD(text, type) declares one keyword with the token type it produces.
  // The users of this table supply their own definitions for both macros.
  #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
      KEYWORD_GROUP('a') \
      KEYWORD("as", Token::AS) \
      KEYWORD("and", Token::AND) \
      KEYWORD("assert", Token::ASSERT) \
      KEYWORD("await", Token::AWAIT) \
      KEYWORD_GROUP('b') \
      KEYWORD("break", Token::BREAK) \
      KEYWORD("breakpoint", Token::BREAKPOINT) \
      KEYWORD_GROUP('c') \
      KEYWORD("class", Token::CLASS) \
      KEYWORD("class_name", Token::CLASS_NAME) \
      KEYWORD("const", Token::CONST) \
      KEYWORD("continue", Token::CONTINUE) \
      KEYWORD_GROUP('e') \
      KEYWORD("elif", Token::ELIF) \
      KEYWORD("else", Token::ELSE) \
      KEYWORD("enum", Token::ENUM) \
      KEYWORD("extends", Token::EXTENDS) \
      KEYWORD_GROUP('f') \
      KEYWORD("for", Token::FOR) \
      KEYWORD("func", Token::FUNC) \
      KEYWORD_GROUP('i') \
      KEYWORD("if", Token::IF) \
      KEYWORD("in", Token::IN) \
      KEYWORD("is", Token::IS) \
      KEYWORD_GROUP('m') \
      KEYWORD("match", Token::MATCH) \
      KEYWORD_GROUP('n') \
      KEYWORD("namespace", Token::NAMESPACE) \
      KEYWORD("not", Token::NOT) \
      KEYWORD_GROUP('o') \
      KEYWORD("or", Token::OR) \
      KEYWORD_GROUP('p') \
      KEYWORD("pass", Token::PASS) \
      KEYWORD("preload", Token::PRELOAD) \
      KEYWORD_GROUP('r') \
      KEYWORD("return", Token::RETURN) \
      KEYWORD_GROUP('s') \
      KEYWORD("self", Token::SELF) \
      KEYWORD("signal", Token::SIGNAL) \
      KEYWORD("static", Token::STATIC) \
      KEYWORD("super", Token::SUPER) \
      KEYWORD_GROUP('t') \
      KEYWORD("trait", Token::TRAIT) \
      KEYWORD_GROUP('v') \
      KEYWORD("var", Token::VAR) \
      KEYWORD("void", Token::VOID) \
      KEYWORD_GROUP('w') \
      KEYWORD("while", Token::WHILE) \
      KEYWORD_GROUP('y') \
      KEYWORD("yield", Token::YIELD) \
      KEYWORD_GROUP('I') \
      KEYWORD("INF", Token::CONST_INF) \
      KEYWORD_GROUP('N') \
      KEYWORD("NAN", Token::CONST_NAN) \
      KEYWORD_GROUP('P') \
      KEYWORD("PI", Token::CONST_PI) \
      KEYWORD_GROUP('T') \
      KEYWORD("TAU", Token::CONST_TAU)

  // Length bounds of the keywords above. Names outside this range are never
  // checked against the table, and each table entry is statically asserted
  // against these bounds where the table is expanded for lookup.
  #define MIN_KEYWORD_LENGTH 2
  #define MAX_KEYWORD_LENGTH 10
  #ifdef DEBUG_ENABLED
  // Fills keyword_list with the text of every entry in the KEYWORDS table, in table
  // order. Debug builds only; the list is used by the confusable-identifier check.
  void GDScriptTokenizer::make_keyword_list() {
  // Each keyword entry expands to its text followed by a comma; group markers expand to nothing.
  #define KEYWORD_AS_LIST_ENTRY(keyword, token_type) keyword,
  #define KEYWORD_GROUP_SKIP(group)
      keyword_list = {
          KEYWORDS(KEYWORD_GROUP_SKIP, KEYWORD_AS_LIST_ENTRY)
      };
  #undef KEYWORD_AS_LIST_ENTRY
  #undef KEYWORD_GROUP_SKIP
  }
  #endif // DEBUG_ENABLED
  501. GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
  502. bool only_ascii = _peek(-1) < 128;
  503. // Consume all identifier characters.
  504. while (is_unicode_identifier_continue(_peek())) {
  505. char32_t c = _advance();
  506. only_ascii = only_ascii && c < 128;
  507. }
  508. int len = _current - _start;
  509. if (len == 1 && _peek(-1) == '_') {
  510. // Lone underscore.
  511. return make_token(Token::UNDERSCORE);
  512. }
  513. String name(_start, len);
  514. if (len < MIN_KEYWORD_LENGTH || len > MAX_KEYWORD_LENGTH) {
  515. // Cannot be a keyword, as the length doesn't match any.
  516. return make_identifier(name);
  517. }
  518. // Define some helper macros for the switch case.
  519. #define KEYWORD_GROUP_CASE(char) \
  520. break; \
  521. case char:
  522. #define KEYWORD(keyword, token_type) \
  523. { \
  524. const int keyword_length = sizeof(keyword) - 1; \
  525. static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length"); \
  526. static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \
  527. if (keyword_length == len && name == keyword) { \
  528. return make_token(token_type); \
  529. } \
  530. }
  531. // Find if it's a keyword.
  532. switch (_start[0]) {
  533. default:
  534. KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  535. break;
  536. }
  537. // Check if it's a special literal
  538. if (len == 4) {
  539. if (name == "true") {
  540. return make_literal(true);
  541. } else if (name == "null") {
  542. return make_literal(Variant());
  543. }
  544. } else if (len == 5) {
  545. if (name == "false") {
  546. return make_literal(false);
  547. }
  548. }
  549. // Not a keyword, so must be an identifier.
  550. Token id = make_identifier(name);
  551. #ifdef DEBUG_ENABLED
  552. // Additional checks for identifiers but only in debug and if it's available in TextServer.
  553. if (!only_ascii && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
  554. int64_t confusable = TS->is_confusable(name, keyword_list);
  555. if (confusable >= 0) {
  556. push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
  557. }
  558. }
  559. #endif // DEBUG_ENABLED
  560. return id;
  561. #undef KEYWORD_GROUP_CASE
  562. #undef KEYWORD
  563. }
  564. #undef MAX_KEYWORD_LENGTH
  565. #undef MIN_KEYWORD_LENGTH
  566. #undef KEYWORDS
  567. void GDScriptTokenizer::newline(bool p_make_token) {
  568. // Don't overwrite previous newline, nor create if we want a line continuation.
  569. if (p_make_token && !pending_newline && !line_continuation) {
  570. Token newline(Token::NEWLINE);
  571. newline.start_line = line;
  572. newline.end_line = line;
  573. newline.start_column = column - 1;
  574. newline.end_column = column;
  575. newline.leftmost_column = newline.start_column;
  576. newline.rightmost_column = newline.end_column;
  577. pending_newline = true;
  578. last_newline = newline;
  579. }
  580. // Increment line/column counters.
  581. line++;
  582. column = 1;
  583. leftmost_column = 1;
  584. }
  585. GDScriptTokenizer::Token GDScriptTokenizer::number() {
  586. int base = 10;
  587. bool has_decimal = false;
  588. bool has_exponent = false;
  589. bool has_error = false;
  590. bool (*digit_check_func)(char32_t) = is_digit;
  591. if (_peek(-1) == '.') {
  592. has_decimal = true;
  593. } else if (_peek(-1) == '0') {
  594. if (_peek() == 'x') {
  595. // Hexadecimal.
  596. base = 16;
  597. digit_check_func = is_hex_digit;
  598. _advance();
  599. } else if (_peek() == 'b') {
  600. // Binary.
  601. base = 2;
  602. digit_check_func = is_binary_digit;
  603. _advance();
  604. }
  605. }
  606. // Allow '_' to be used in a number, for readability.
  607. bool previous_was_underscore = false;
  608. while (digit_check_func(_peek()) || is_underscore(_peek())) {
  609. if (is_underscore(_peek())) {
  610. if (previous_was_underscore) {
  611. Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
  612. error.start_column = column;
  613. error.leftmost_column = column;
  614. error.end_column = column + 1;
  615. error.rightmost_column = column + 1;
  616. push_error(error);
  617. }
  618. previous_was_underscore = true;
  619. } else {
  620. previous_was_underscore = false;
  621. }
  622. _advance();
  623. }
  624. // It might be a ".." token (instead of decimal point) so we check if it's not.
  625. if (_peek() == '.' && _peek(1) != '.') {
  626. if (base == 10 && !has_decimal) {
  627. has_decimal = true;
  628. } else if (base == 10) {
  629. Token error = make_error("Cannot use a decimal point twice in a number.");
  630. error.start_column = column;
  631. error.leftmost_column = column;
  632. error.end_column = column + 1;
  633. error.rightmost_column = column + 1;
  634. push_error(error);
  635. has_error = true;
  636. } else if (base == 16) {
  637. Token error = make_error("Cannot use a decimal point in a hexadecimal number.");
  638. error.start_column = column;
  639. error.leftmost_column = column;
  640. error.end_column = column + 1;
  641. error.rightmost_column = column + 1;
  642. push_error(error);
  643. has_error = true;
  644. } else {
  645. Token error = make_error("Cannot use a decimal point in a binary number.");
  646. error.start_column = column;
  647. error.leftmost_column = column;
  648. error.end_column = column + 1;
  649. error.rightmost_column = column + 1;
  650. push_error(error);
  651. has_error = true;
  652. }
  653. if (!has_error) {
  654. _advance();
  655. // Consume decimal digits.
  656. while (is_digit(_peek()) || is_underscore(_peek())) {
  657. _advance();
  658. }
  659. }
  660. }
  661. if (base == 10) {
  662. if (_peek() == 'e' || _peek() == 'E') {
  663. has_exponent = true;
  664. _advance();
  665. if (_peek() == '+' || _peek() == '-') {
  666. // Exponent sign.
  667. _advance();
  668. }
  669. // Consume exponent digits.
  670. if (!is_digit(_peek())) {
  671. Token error = make_error(R"(Expected exponent value after "e".)");
  672. error.start_column = column;
  673. error.leftmost_column = column;
  674. error.end_column = column + 1;
  675. error.rightmost_column = column + 1;
  676. push_error(error);
  677. }
  678. previous_was_underscore = false;
  679. while (is_digit(_peek()) || is_underscore(_peek())) {
  680. if (is_underscore(_peek())) {
  681. if (previous_was_underscore) {
  682. Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
  683. error.start_column = column;
  684. error.leftmost_column = column;
  685. error.end_column = column + 1;
  686. error.rightmost_column = column + 1;
  687. push_error(error);
  688. }
  689. previous_was_underscore = true;
  690. } else {
  691. previous_was_underscore = false;
  692. }
  693. _advance();
  694. }
  695. }
  696. }
  697. // Detect extra decimal point.
  698. if (!has_error && has_decimal && _peek() == '.' && _peek(1) != '.') {
  699. Token error = make_error("Cannot use a decimal point twice in a number.");
  700. error.start_column = column;
  701. error.leftmost_column = column;
  702. error.end_column = column + 1;
  703. error.rightmost_column = column + 1;
  704. push_error(error);
  705. has_error = true;
  706. } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
  707. // Letter at the end of the number.
  708. push_error("Invalid numeric notation.");
  709. }
  710. // Create a string with the whole number.
  711. int len = _current - _start;
  712. String number = String(_start, len).replace("_", "");
  713. // Convert to the appropriate literal type.
  714. if (base == 16) {
  715. int64_t value = number.hex_to_int();
  716. return make_literal(value);
  717. } else if (base == 2) {
  718. int64_t value = number.bin_to_int();
  719. return make_literal(value);
  720. } else if (has_decimal || has_exponent) {
  721. double value = number.to_float();
  722. return make_literal(value);
  723. } else {
  724. int64_t value = number.to_int();
  725. return make_literal(value);
  726. }
  727. }
  728. GDScriptTokenizer::Token GDScriptTokenizer::string() {
  729. enum StringType {
  730. STRING_REGULAR,
  731. STRING_NAME,
  732. STRING_NODEPATH,
  733. };
  734. bool is_multiline = false;
  735. StringType type = STRING_REGULAR;
  736. if (_peek(-1) == '&') {
  737. type = STRING_NAME;
  738. _advance();
  739. } else if (_peek(-1) == '^') {
  740. type = STRING_NODEPATH;
  741. _advance();
  742. }
  743. char32_t quote_char = _peek(-1);
  744. if (_peek() == quote_char && _peek(1) == quote_char) {
  745. is_multiline = true;
  746. // Consume all quotes.
  747. _advance();
  748. _advance();
  749. }
  750. String result;
  751. char32_t prev = 0;
  752. int prev_pos = 0;
  753. for (;;) {
  754. // Consume actual string.
  755. if (_is_at_end()) {
  756. return make_error("Unterminated string.");
  757. }
  758. char32_t ch = _peek();
  759. if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
  760. Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
  761. error.start_column = column;
  762. error.leftmost_column = error.start_column;
  763. error.end_column = column + 1;
  764. error.rightmost_column = error.end_column;
  765. push_error(error);
  766. }
  767. if (ch == '\\') {
  768. // Escape pattern.
  769. _advance();
  770. if (_is_at_end()) {
  771. return make_error("Unterminated string.");
  772. }
  773. // Grab escape character.
  774. char32_t code = _peek();
  775. _advance();
  776. if (_is_at_end()) {
  777. return make_error("Unterminated string.");
  778. }
  779. char32_t escaped = 0;
  780. bool valid_escape = true;
  781. switch (code) {
  782. case 'a':
  783. escaped = '\a';
  784. break;
  785. case 'b':
  786. escaped = '\b';
  787. break;
  788. case 'f':
  789. escaped = '\f';
  790. break;
  791. case 'n':
  792. escaped = '\n';
  793. break;
  794. case 'r':
  795. escaped = '\r';
  796. break;
  797. case 't':
  798. escaped = '\t';
  799. break;
  800. case 'v':
  801. escaped = '\v';
  802. break;
  803. case '\'':
  804. escaped = '\'';
  805. break;
  806. case '\"':
  807. escaped = '\"';
  808. break;
  809. case '\\':
  810. escaped = '\\';
  811. break;
  812. case 'U':
  813. case 'u': {
  814. // Hexadecimal sequence.
  815. int hex_len = (code == 'U') ? 6 : 4;
  816. for (int j = 0; j < hex_len; j++) {
  817. if (_is_at_end()) {
  818. return make_error("Unterminated string.");
  819. }
  820. char32_t digit = _peek();
  821. char32_t value = 0;
  822. if (is_digit(digit)) {
  823. value = digit - '0';
  824. } else if (digit >= 'a' && digit <= 'f') {
  825. value = digit - 'a';
  826. value += 10;
  827. } else if (digit >= 'A' && digit <= 'F') {
  828. value = digit - 'A';
  829. value += 10;
  830. } else {
  831. // Make error, but keep parsing the string.
  832. Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
  833. error.start_column = column;
  834. error.leftmost_column = error.start_column;
  835. error.end_column = column + 1;
  836. error.rightmost_column = error.end_column;
  837. push_error(error);
  838. valid_escape = false;
  839. break;
  840. }
  841. escaped <<= 4;
  842. escaped |= value;
  843. _advance();
  844. }
  845. } break;
  846. case '\r':
  847. if (_peek() != '\n') {
  848. // Carriage return without newline in string. (???)
  849. // Just add it to the string and keep going.
  850. result += ch;
  851. _advance();
  852. break;
  853. }
  854. [[fallthrough]];
  855. case '\n':
  856. // Escaping newline.
  857. newline(false);
  858. valid_escape = false; // Don't add to the string.
  859. break;
  860. default:
  861. Token error = make_error("Invalid escape in string.");
  862. error.start_column = column - 2;
  863. error.leftmost_column = error.start_column;
  864. push_error(error);
  865. valid_escape = false;
  866. break;
  867. }
  868. // Parse UTF-16 pair.
  869. if (valid_escape) {
  870. if ((escaped & 0xfffffc00) == 0xd800) {
  871. if (prev == 0) {
  872. prev = escaped;
  873. prev_pos = column - 2;
  874. continue;
  875. } else {
  876. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  877. error.start_column = column - 2;
  878. error.leftmost_column = error.start_column;
  879. push_error(error);
  880. valid_escape = false;
  881. prev = 0;
  882. }
  883. } else if ((escaped & 0xfffffc00) == 0xdc00) {
  884. if (prev == 0) {
  885. Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
  886. error.start_column = column - 2;
  887. error.leftmost_column = error.start_column;
  888. push_error(error);
  889. valid_escape = false;
  890. } else {
  891. escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
  892. prev = 0;
  893. }
  894. }
  895. if (prev != 0) {
  896. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  897. error.start_column = prev_pos;
  898. error.leftmost_column = error.start_column;
  899. push_error(error);
  900. prev = 0;
  901. }
  902. }
  903. if (valid_escape) {
  904. result += escaped;
  905. }
  906. } else if (ch == quote_char) {
  907. if (prev != 0) {
  908. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  909. error.start_column = prev_pos;
  910. error.leftmost_column = error.start_column;
  911. push_error(error);
  912. prev = 0;
  913. }
  914. _advance();
  915. if (is_multiline) {
  916. if (_peek() == quote_char && _peek(1) == quote_char) {
  917. // Ended the multiline string. Consume all quotes.
  918. _advance();
  919. _advance();
  920. break;
  921. } else {
  922. // Not a multiline string termination, add consumed quote.
  923. result += quote_char;
  924. }
  925. } else {
  926. // Ended single-line string.
  927. break;
  928. }
  929. } else {
  930. if (prev != 0) {
  931. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  932. error.start_column = prev_pos;
  933. error.leftmost_column = error.start_column;
  934. push_error(error);
  935. prev = 0;
  936. }
  937. result += ch;
  938. _advance();
  939. if (ch == '\n') {
  940. newline(false);
  941. }
  942. }
  943. }
  944. if (prev != 0) {
  945. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  946. error.start_column = prev_pos;
  947. error.leftmost_column = error.start_column;
  948. push_error(error);
  949. prev = 0;
  950. }
  951. // Make the literal.
  952. Variant string;
  953. switch (type) {
  954. case STRING_NAME:
  955. string = StringName(result);
  956. break;
  957. case STRING_NODEPATH:
  958. string = NodePath(result);
  959. break;
  960. case STRING_REGULAR:
  961. string = result;
  962. break;
  963. }
  964. return make_literal(string);
  965. }
  966. void GDScriptTokenizer::check_indent() {
  967. ERR_FAIL_COND_MSG(column != 1, "Checking tokenizer indentation in the middle of a line.");
  968. if (_is_at_end()) {
  969. // Send dedents for every indent level.
  970. pending_indents -= indent_level();
  971. indent_stack.clear();
  972. return;
  973. }
  974. for (;;) {
  975. char32_t current_indent_char = _peek();
  976. int indent_count = 0;
  977. if (current_indent_char != ' ' && current_indent_char != '\t' && current_indent_char != '\r' && current_indent_char != '\n' && current_indent_char != '#') {
  978. // First character of the line is not whitespace, so we clear all indentation levels.
  979. // Unless we are in a continuation or in multiline mode (inside expression).
  980. if (line_continuation || multiline_mode) {
  981. return;
  982. }
  983. pending_indents -= indent_level();
  984. indent_stack.clear();
  985. return;
  986. }
  987. if (_peek() == '\r') {
  988. _advance();
  989. if (_peek() != '\n') {
  990. push_error("Stray carriage return character in source code.");
  991. }
  992. }
  993. if (_peek() == '\n') {
  994. // Empty line, keep going.
  995. _advance();
  996. newline(false);
  997. continue;
  998. }
  999. // Check indent level.
  1000. bool mixed = false;
  1001. while (!_is_at_end()) {
  1002. char32_t space = _peek();
  1003. if (space == '\t') {
  1004. // Consider individual tab columns.
  1005. column += tab_size - 1;
  1006. indent_count += tab_size;
  1007. } else if (space == ' ') {
  1008. indent_count += 1;
  1009. } else {
  1010. break;
  1011. }
  1012. mixed = mixed || space != current_indent_char;
  1013. _advance();
  1014. }
  1015. if (mixed && !(line_continuation || multiline_mode)) {
  1016. Token error = make_error("Mixed use of tabs and spaces for indentation.");
  1017. error.start_line = line;
  1018. error.start_column = 1;
  1019. error.leftmost_column = 1;
  1020. error.rightmost_column = column;
  1021. push_error(error);
  1022. }
  1023. if (_is_at_end()) {
  1024. // Reached the end with an empty line, so just dedent as much as needed.
  1025. pending_indents -= indent_level();
  1026. indent_stack.clear();
  1027. return;
  1028. }
  1029. if (_peek() == '\r') {
  1030. _advance();
  1031. if (_peek() != '\n') {
  1032. push_error("Stray carriage return character in source code.");
  1033. }
  1034. }
  1035. if (_peek() == '\n') {
  1036. // Empty line, keep going.
  1037. _advance();
  1038. newline(false);
  1039. continue;
  1040. }
  1041. if (_peek() == '#') {
  1042. // Comment. Advance to the next line.
  1043. #ifdef TOOLS_ENABLED
  1044. String comment;
  1045. while (_peek() != '\n' && !_is_at_end()) {
  1046. comment += _advance();
  1047. }
  1048. comments[line] = CommentData(comment, true);
  1049. #else
  1050. while (_peek() != '\n' && !_is_at_end()) {
  1051. _advance();
  1052. }
  1053. #endif // TOOLS_ENABLED
  1054. if (_is_at_end()) {
  1055. // Reached the end with an empty line, so just dedent as much as needed.
  1056. pending_indents -= indent_level();
  1057. indent_stack.clear();
  1058. return;
  1059. }
  1060. _advance(); // Consume '\n'.
  1061. newline(false);
  1062. continue;
  1063. }
  1064. if (line_continuation || multiline_mode) {
  1065. // We cleared up all the whitespace at the beginning of the line.
  1066. // But if this is a continuation or multiline mode and we don't want any indentation change.
  1067. return;
  1068. }
  1069. // Check if indentation character is consistent.
  1070. if (indent_char == '\0') {
  1071. // First time indenting, choose character now.
  1072. indent_char = current_indent_char;
  1073. } else if (current_indent_char != indent_char) {
  1074. Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
  1075. _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
  1076. error.start_line = line;
  1077. error.start_column = 1;
  1078. error.leftmost_column = 1;
  1079. error.rightmost_column = column;
  1080. push_error(error);
  1081. }
  1082. // Now we can do actual indentation changes.
  1083. // Check if indent or dedent.
  1084. int previous_indent = 0;
  1085. if (indent_level() > 0) {
  1086. previous_indent = indent_stack.back()->get();
  1087. }
  1088. if (indent_count == previous_indent) {
  1089. // No change in indentation.
  1090. return;
  1091. }
  1092. if (indent_count > previous_indent) {
  1093. // Indentation increased.
  1094. indent_stack.push_back(indent_count);
  1095. pending_indents++;
  1096. } else {
  1097. // Indentation decreased (dedent).
  1098. if (indent_level() == 0) {
  1099. push_error("Tokenizer bug: trying to dedent without previous indent.");
  1100. return;
  1101. }
  1102. while (indent_level() > 0 && indent_stack.back()->get() > indent_count) {
  1103. indent_stack.pop_back();
  1104. pending_indents--;
  1105. }
  1106. if ((indent_level() > 0 && indent_stack.back()->get() != indent_count) || (indent_level() == 0 && indent_count != 0)) {
  1107. // Mismatched indentation alignment.
  1108. Token error = make_error("Unindent doesn't match the previous indentation level.");
  1109. error.start_line = line;
  1110. error.start_column = 1;
  1111. error.leftmost_column = 1;
  1112. error.end_column = column + 1;
  1113. error.rightmost_column = column + 1;
  1114. push_error(error);
  1115. // Still, we'll be lenient and keep going, so keep this level in the stack.
  1116. indent_stack.push_back(indent_count);
  1117. }
  1118. }
  1119. break; // Get out of the loop in any case.
  1120. }
  1121. }
  1122. String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
  1123. ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());
  1124. return ch == ' ' ? "space" : "tab";
  1125. }
  1126. void GDScriptTokenizer::_skip_whitespace() {
  1127. if (pending_indents != 0) {
  1128. // Still have some indent/dedent tokens to give.
  1129. return;
  1130. }
  1131. bool is_bol = column == 1; // Beginning of line.
  1132. if (is_bol) {
  1133. check_indent();
  1134. return;
  1135. }
  1136. for (;;) {
  1137. char32_t c = _peek();
  1138. switch (c) {
  1139. case ' ':
  1140. _advance();
  1141. break;
  1142. case '\t':
  1143. _advance();
  1144. // Consider individual tab columns.
  1145. column += tab_size - 1;
  1146. break;
  1147. case '\r':
  1148. _advance(); // Consume either way.
  1149. if (_peek() != '\n') {
  1150. push_error("Stray carriage return character in source code.");
  1151. return;
  1152. }
  1153. break;
  1154. case '\n':
  1155. _advance();
  1156. newline(!is_bol); // Don't create new line token if line is empty.
  1157. check_indent();
  1158. break;
  1159. case '#': {
  1160. // Comment.
  1161. #ifdef TOOLS_ENABLED
  1162. String comment;
  1163. while (_peek() != '\n' && !_is_at_end()) {
  1164. comment += _advance();
  1165. }
  1166. comments[line] = CommentData(comment, is_bol);
  1167. #else
  1168. while (_peek() != '\n' && !_is_at_end()) {
  1169. _advance();
  1170. }
  1171. #endif // TOOLS_ENABLED
  1172. if (_is_at_end()) {
  1173. return;
  1174. }
  1175. _advance(); // Consume '\n'
  1176. newline(!is_bol);
  1177. check_indent();
  1178. } break;
  1179. default:
  1180. return;
  1181. }
  1182. }
  1183. }
  1184. GDScriptTokenizer::Token GDScriptTokenizer::scan() {
  1185. if (has_error()) {
  1186. return pop_error();
  1187. }
  1188. _skip_whitespace();
  1189. if (pending_newline) {
  1190. pending_newline = false;
  1191. if (!multiline_mode) {
  1192. // Don't return newline tokens on multiline mode.
  1193. return last_newline;
  1194. }
  1195. }
  1196. // Check for potential errors after skipping whitespace().
  1197. if (has_error()) {
  1198. return pop_error();
  1199. }
  1200. _start = _current;
  1201. start_line = line;
  1202. start_column = column;
  1203. leftmost_column = column;
  1204. rightmost_column = column;
  1205. if (pending_indents != 0) {
  1206. // Adjust position for indent.
  1207. _start -= start_column - 1;
  1208. start_column = 1;
  1209. leftmost_column = 1;
  1210. if (pending_indents > 0) {
  1211. // Indents.
  1212. pending_indents--;
  1213. return make_token(Token::INDENT);
  1214. } else {
  1215. // Dedents.
  1216. pending_indents++;
  1217. Token dedent = make_token(Token::DEDENT);
  1218. dedent.end_column += 1;
  1219. dedent.rightmost_column += 1;
  1220. return dedent;
  1221. }
  1222. }
  1223. if (_is_at_end()) {
  1224. return make_token(Token::TK_EOF);
  1225. }
  1226. const char32_t c = _advance();
  1227. if (c == '\\') {
  1228. // Line continuation with backslash.
  1229. if (_peek() == '\r') {
  1230. if (_peek(1) != '\n') {
  1231. return make_error("Unexpected carriage return character.");
  1232. }
  1233. _advance();
  1234. }
  1235. if (_peek() != '\n') {
  1236. return make_error("Expected new line after \"\\\".");
  1237. }
  1238. _advance();
  1239. newline(false);
  1240. line_continuation = true;
  1241. return scan(); // Recurse to get next token.
  1242. }
  1243. line_continuation = false;
  1244. if (is_digit(c)) {
  1245. return number();
  1246. } else if (is_unicode_identifier_start(c)) {
  1247. return potential_identifier();
  1248. }
  1249. switch (c) {
  1250. // String literals.
  1251. case '"':
  1252. case '\'':
  1253. return string();
  1254. // Annotation.
  1255. case '@':
  1256. return annotation();
  1257. // Single characters.
  1258. case '~':
  1259. return make_token(Token::TILDE);
  1260. case ',':
  1261. return make_token(Token::COMMA);
  1262. case ':':
  1263. return make_token(Token::COLON);
  1264. case ';':
  1265. return make_token(Token::SEMICOLON);
  1266. case '$':
  1267. return make_token(Token::DOLLAR);
  1268. case '?':
  1269. return make_token(Token::QUESTION_MARK);
  1270. case '`':
  1271. return make_token(Token::BACKTICK);
  1272. // Parens.
  1273. case '(':
  1274. push_paren('(');
  1275. return make_token(Token::PARENTHESIS_OPEN);
  1276. case '[':
  1277. push_paren('[');
  1278. return make_token(Token::BRACKET_OPEN);
  1279. case '{':
  1280. push_paren('{');
  1281. return make_token(Token::BRACE_OPEN);
  1282. case ')':
  1283. if (!pop_paren('(')) {
  1284. return make_paren_error(c);
  1285. }
  1286. return make_token(Token::PARENTHESIS_CLOSE);
  1287. case ']':
  1288. if (!pop_paren('[')) {
  1289. return make_paren_error(c);
  1290. }
  1291. return make_token(Token::BRACKET_CLOSE);
  1292. case '}':
  1293. if (!pop_paren('{')) {
  1294. return make_paren_error(c);
  1295. }
  1296. return make_token(Token::BRACE_CLOSE);
  1297. // Double characters.
  1298. case '!':
  1299. if (_peek() == '=') {
  1300. _advance();
  1301. return make_token(Token::BANG_EQUAL);
  1302. } else {
  1303. return make_token(Token::BANG);
  1304. }
  1305. case '.':
  1306. if (_peek() == '.') {
  1307. _advance();
  1308. return make_token(Token::PERIOD_PERIOD);
  1309. } else if (is_digit(_peek())) {
  1310. // Number starting with '.'.
  1311. return number();
  1312. } else {
  1313. return make_token(Token::PERIOD);
  1314. }
  1315. case '+':
  1316. if (_peek() == '=') {
  1317. _advance();
  1318. return make_token(Token::PLUS_EQUAL);
  1319. } else {
  1320. return make_token(Token::PLUS);
  1321. }
  1322. case '-':
  1323. if (_peek() == '=') {
  1324. _advance();
  1325. return make_token(Token::MINUS_EQUAL);
  1326. } else if (_peek() == '>') {
  1327. _advance();
  1328. return make_token(Token::FORWARD_ARROW);
  1329. } else {
  1330. return make_token(Token::MINUS);
  1331. }
  1332. case '*':
  1333. if (_peek() == '=') {
  1334. _advance();
  1335. return make_token(Token::STAR_EQUAL);
  1336. } else if (_peek() == '*') {
  1337. if (_peek(1) == '=') {
  1338. _advance();
  1339. _advance(); // Advance both '*' and '='
  1340. return make_token(Token::STAR_STAR_EQUAL);
  1341. }
  1342. _advance();
  1343. return make_token(Token::STAR_STAR);
  1344. } else {
  1345. return make_token(Token::STAR);
  1346. }
  1347. case '/':
  1348. if (_peek() == '=') {
  1349. _advance();
  1350. return make_token(Token::SLASH_EQUAL);
  1351. } else {
  1352. return make_token(Token::SLASH);
  1353. }
  1354. case '%':
  1355. if (_peek() == '=') {
  1356. _advance();
  1357. return make_token(Token::PERCENT_EQUAL);
  1358. } else {
  1359. return make_token(Token::PERCENT);
  1360. }
  1361. case '^':
  1362. if (_peek() == '=') {
  1363. _advance();
  1364. return make_token(Token::CARET_EQUAL);
  1365. } else if (_peek() == '"' || _peek() == '\'') {
  1366. // Node path
  1367. return string();
  1368. } else {
  1369. return make_token(Token::CARET);
  1370. }
  1371. case '&':
  1372. if (_peek() == '&') {
  1373. _advance();
  1374. return make_token(Token::AMPERSAND_AMPERSAND);
  1375. } else if (_peek() == '=') {
  1376. _advance();
  1377. return make_token(Token::AMPERSAND_EQUAL);
  1378. } else if (_peek() == '"' || _peek() == '\'') {
  1379. // String Name
  1380. return string();
  1381. } else {
  1382. return make_token(Token::AMPERSAND);
  1383. }
  1384. case '|':
  1385. if (_peek() == '|') {
  1386. _advance();
  1387. return make_token(Token::PIPE_PIPE);
  1388. } else if (_peek() == '=') {
  1389. _advance();
  1390. return make_token(Token::PIPE_EQUAL);
  1391. } else {
  1392. return make_token(Token::PIPE);
  1393. }
  1394. // Potential VCS conflict markers.
  1395. case '=':
  1396. if (_peek() == '=') {
  1397. return check_vcs_marker('=', Token::EQUAL_EQUAL);
  1398. } else {
  1399. return make_token(Token::EQUAL);
  1400. }
  1401. case '<':
  1402. if (_peek() == '=') {
  1403. _advance();
  1404. return make_token(Token::LESS_EQUAL);
  1405. } else if (_peek() == '<') {
  1406. if (_peek(1) == '=') {
  1407. _advance();
  1408. _advance(); // Advance both '<' and '='
  1409. return make_token(Token::LESS_LESS_EQUAL);
  1410. } else {
  1411. return check_vcs_marker('<', Token::LESS_LESS);
  1412. }
  1413. } else {
  1414. return make_token(Token::LESS);
  1415. }
  1416. case '>':
  1417. if (_peek() == '=') {
  1418. _advance();
  1419. return make_token(Token::GREATER_EQUAL);
  1420. } else if (_peek() == '>') {
  1421. if (_peek(1) == '=') {
  1422. _advance();
  1423. _advance(); // Advance both '>' and '='
  1424. return make_token(Token::GREATER_GREATER_EQUAL);
  1425. } else {
  1426. return check_vcs_marker('>', Token::GREATER_GREATER);
  1427. }
  1428. } else {
  1429. return make_token(Token::GREATER);
  1430. }
  1431. default:
  1432. if (is_whitespace(c)) {
  1433. return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c)));
  1434. } else {
  1435. return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
  1436. }
  1437. }
  1438. }
  1439. GDScriptTokenizer::GDScriptTokenizer() {
  1440. #ifdef TOOLS_ENABLED
  1441. if (EditorSettings::get_singleton()) {
  1442. tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size");
  1443. }
  1444. #endif // TOOLS_ENABLED
  1445. #ifdef DEBUG_ENABLED
  1446. make_keyword_list();
  1447. #endif // DEBUG_ENABLED
  1448. }