1/**************************************************************************/
2/* gdscript_tokenizer.cpp */
3/**************************************************************************/
4/* This file is part of: */
5/* GODOT ENGINE */
6/* https://godotengine.org */
7/**************************************************************************/
8/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
10/* */
11/* Permission is hereby granted, free of charge, to any person obtaining */
12/* a copy of this software and associated documentation files (the */
13/* "Software"), to deal in the Software without restriction, including */
14/* without limitation the rights to use, copy, modify, merge, publish, */
15/* distribute, sublicense, and/or sell copies of the Software, and to */
16/* permit persons to whom the Software is furnished to do so, subject to */
17/* the following conditions: */
18/* */
19/* The above copyright notice and this permission notice shall be */
20/* included in all copies or substantial portions of the Software. */
21/* */
22/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
23/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
24/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
26/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
27/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
28/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
29/**************************************************************************/
30
31#include "gdscript_tokenizer.h"
32
33#include "core/error/error_macros.h"
34#include "core/string/char_utils.h"
35
36#ifdef DEBUG_ENABLED
37#include "servers/text_server.h"
38#endif
39
40#ifdef TOOLS_ENABLED
41#include "editor/editor_settings.h"
42#endif
43
// Display name for every token type, looked up by indexing with the
// `Token::Type` value (see `Token::get_name()` / `get_token_name()`).
// The trailing comment on each entry names the enumerator the entry is
// expected to line up with, so the order here has to follow the enum.
static const char *token_names[] = {
	"Empty", // EMPTY,
	// Basic
	"Annotation", // ANNOTATION
	"Identifier", // IDENTIFIER,
	"Literal", // LITERAL,
	// Comparison
	"<", // LESS,
	"<=", // LESS_EQUAL,
	">", // GREATER,
	">=", // GREATER_EQUAL,
	"==", // EQUAL_EQUAL,
	"!=", // BANG_EQUAL,
	// Logical
	"and", // AND,
	"or", // OR,
	"not", // NOT,
	"&&", // AMPERSAND_AMPERSAND,
	"||", // PIPE_PIPE,
	"!", // BANG,
	// Bitwise
	"&", // AMPERSAND,
	"|", // PIPE,
	"~", // TILDE,
	"^", // CARET,
	"<<", // LESS_LESS,
	">>", // GREATER_GREATER,
	// Math
	"+", // PLUS,
	"-", // MINUS,
	"*", // STAR,
	"**", // STAR_STAR,
	"/", // SLASH,
	"%", // PERCENT,
	// Assignment
	"=", // EQUAL,
	"+=", // PLUS_EQUAL,
	"-=", // MINUS_EQUAL,
	"*=", // STAR_EQUAL,
	"**=", // STAR_STAR_EQUAL,
	"/=", // SLASH_EQUAL,
	"%=", // PERCENT_EQUAL,
	"<<=", // LESS_LESS_EQUAL,
	">>=", // GREATER_GREATER_EQUAL,
	"&=", // AMPERSAND_EQUAL,
	"|=", // PIPE_EQUAL,
	"^=", // CARET_EQUAL,
	// Control flow
	"if", // IF,
	"elif", // ELIF,
	"else", // ELSE,
	"for", // FOR,
	"while", // WHILE,
	"break", // BREAK,
	"continue", // CONTINUE,
	"pass", // PASS,
	"return", // RETURN,
	"match", // MATCH,
	// Keywords
	"as", // AS,
	"assert", // ASSERT,
	"await", // AWAIT,
	"breakpoint", // BREAKPOINT,
	"class", // CLASS,
	"class_name", // CLASS_NAME,
	"const", // CONST,
	"enum", // ENUM,
	"extends", // EXTENDS,
	"func", // FUNC,
	"in", // IN,
	"is", // IS,
	"namespace", // NAMESPACE
	"preload", // PRELOAD,
	"self", // SELF,
	"signal", // SIGNAL,
	"static", // STATIC,
	"super", // SUPER,
	"trait", // TRAIT,
	"var", // VAR,
	"void", // VOID,
	"yield", // YIELD,
	// Punctuation
	"[", // BRACKET_OPEN,
	"]", // BRACKET_CLOSE,
	"{", // BRACE_OPEN,
	"}", // BRACE_CLOSE,
	"(", // PARENTHESIS_OPEN,
	")", // PARENTHESIS_CLOSE,
	",", // COMMA,
	";", // SEMICOLON,
	".", // PERIOD,
	"..", // PERIOD_PERIOD,
	":", // COLON,
	"$", // DOLLAR,
	"->", // FORWARD_ARROW,
	"_", // UNDERSCORE,
	// Whitespace
	"Newline", // NEWLINE,
	"Indent", // INDENT,
	"Dedent", // DEDENT,
	// Constants
	"PI", // CONST_PI,
	"TAU", // CONST_TAU,
	"INF", // CONST_INF,
	"NaN", // CONST_NAN,
	// Error message improvement
	"VCS conflict marker", // VCS_CONFLICT_MARKER,
	"`", // BACKTICK,
	"?", // QUESTION_MARK,
	// Special
	"Error", // ERROR,
	"End of file", // EOF,
};

// Avoid desync: fail the build when the number of names above differs from
// `Token::TK_MAX`.
static_assert(sizeof(token_names) / sizeof(token_names[0]) == GDScriptTokenizer::Token::TK_MAX, "Amount of token names don't match the amount of token types.");
160
161const char *GDScriptTokenizer::Token::get_name() const {
162 ERR_FAIL_INDEX_V_MSG(type, TK_MAX, "<error>", "Using token type out of the enum.");
163 return token_names[type];
164}
165
166bool GDScriptTokenizer::Token::can_precede_bin_op() const {
167 switch (type) {
168 case IDENTIFIER:
169 case LITERAL:
170 case SELF:
171 case BRACKET_CLOSE:
172 case BRACE_CLOSE:
173 case PARENTHESIS_CLOSE:
174 case CONST_PI:
175 case CONST_TAU:
176 case CONST_INF:
177 case CONST_NAN:
178 return true;
179 default:
180 return false;
181 }
182}
183
184bool GDScriptTokenizer::Token::is_identifier() const {
185 // Note: Most keywords should not be recognized as identifiers.
186 // These are only exceptions for stuff that already is on the engine's API.
187 switch (type) {
188 case IDENTIFIER:
189 case MATCH: // Used in String.match().
190 // Allow constants to be treated as regular identifiers.
191 case CONST_PI:
192 case CONST_INF:
193 case CONST_NAN:
194 case CONST_TAU:
195 return true;
196 default:
197 return false;
198 }
199}
200
201bool GDScriptTokenizer::Token::is_node_name() const {
202 // This is meant to allow keywords with the $ notation, but not as general identifiers.
203 switch (type) {
204 case IDENTIFIER:
205 case AND:
206 case AS:
207 case ASSERT:
208 case AWAIT:
209 case BREAK:
210 case BREAKPOINT:
211 case CLASS_NAME:
212 case CLASS:
213 case CONST:
214 case CONST_PI:
215 case CONST_INF:
216 case CONST_NAN:
217 case CONST_TAU:
218 case CONTINUE:
219 case ELIF:
220 case ELSE:
221 case ENUM:
222 case EXTENDS:
223 case FOR:
224 case FUNC:
225 case IF:
226 case IN:
227 case IS:
228 case MATCH:
229 case NAMESPACE:
230 case NOT:
231 case OR:
232 case PASS:
233 case PRELOAD:
234 case RETURN:
235 case SELF:
236 case SIGNAL:
237 case STATIC:
238 case SUPER:
239 case TRAIT:
240 case UNDERSCORE:
241 case VAR:
242 case VOID:
243 case WHILE:
244 case YIELD:
245 return true;
246 default:
247 return false;
248 }
249}
250
251String GDScriptTokenizer::get_token_name(Token::Type p_token_type) {
252 ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>", "Using token type out of the enum.");
253 return token_names[p_token_type];
254}
255
256void GDScriptTokenizer::set_source_code(const String &p_source_code) {
257 source = p_source_code;
258 if (source.is_empty()) {
259 _source = U"";
260 } else {
261 _source = source.ptr();
262 }
263 _current = _source;
264 line = 1;
265 column = 1;
266 length = p_source_code.length();
267 position = 0;
268}
269
270void GDScriptTokenizer::set_cursor_position(int p_line, int p_column) {
271 cursor_line = p_line;
272 cursor_column = p_column;
273}
274
// Sets the `multiline_mode` flag. While it is set, check_indent() leaves the
// indentation stack untouched when a line starts with a non-whitespace character.
void GDScriptTokenizer::set_multiline_mode(bool p_state) {
	multiline_mode = p_state;
}
278
// Saves a copy of the current indentation stack so that it can be restored
// later by pop_expression_indented_block().
void GDScriptTokenizer::push_expression_indented_block() {
	indent_stack_stack.push_back(indent_stack);
}
282
283void GDScriptTokenizer::pop_expression_indented_block() {
284 ERR_FAIL_COND(indent_stack_stack.size() == 0);
285 indent_stack = indent_stack_stack.back()->get();
286 indent_stack_stack.pop_back();
287}
288
// Returns the cursor line stored by set_cursor_position().
int GDScriptTokenizer::get_cursor_line() const {
	return cursor_line;
}
292
// Returns the cursor column stored by set_cursor_position().
int GDScriptTokenizer::get_cursor_column() const {
	return cursor_column;
}
296
297bool GDScriptTokenizer::is_past_cursor() const {
298 if (line < cursor_line) {
299 return false;
300 }
301 if (line > cursor_line) {
302 return true;
303 }
304 if (column < cursor_column) {
305 return false;
306 }
307 return true;
308}
309
310char32_t GDScriptTokenizer::_advance() {
311 if (unlikely(_is_at_end())) {
312 return '\0';
313 }
314 _current++;
315 column++;
316 position++;
317 if (column > rightmost_column) {
318 rightmost_column = column;
319 }
320 if (unlikely(_is_at_end())) {
321 // Add extra newline even if it's not there, to satisfy the parser.
322 newline(true);
323 // Also add needed unindent.
324 check_indent();
325 }
326 return _peek(-1);
327}
328
// Records `p_char` on the stack of currently open delimiters, to be matched
// later by pop_paren().
void GDScriptTokenizer::push_paren(char32_t p_char) {
	paren_stack.push_back(p_char);
}
332
333bool GDScriptTokenizer::pop_paren(char32_t p_expected) {
334 if (paren_stack.is_empty()) {
335 return false;
336 }
337 char32_t actual = paren_stack.back()->get();
338 paren_stack.pop_back();
339
340 return actual == p_expected;
341}
342
343GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
344 Token error = error_stack.back()->get();
345 error_stack.pop_back();
346 return error;
347}
348
349GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
350 Token token(p_type);
351 token.start_line = start_line;
352 token.end_line = line;
353 token.start_column = start_column;
354 token.end_column = column;
355 token.leftmost_column = leftmost_column;
356 token.rightmost_column = rightmost_column;
357 token.source = String(_start, _current - _start);
358
359 if (p_type != Token::ERROR && cursor_line > -1) {
360 // Also count whitespace after token.
361 int offset = 0;
362 while (_peek(offset) == ' ' || _peek(offset) == '\t') {
363 offset++;
364 }
365 int last_column = column + offset;
366 // Check cursor position in token.
367 if (start_line == line) {
368 // Single line token.
369 if (cursor_line == start_line && cursor_column >= start_column && cursor_column <= last_column) {
370 token.cursor_position = cursor_column - start_column;
371 if (cursor_column == start_column) {
372 token.cursor_place = CURSOR_BEGINNING;
373 } else if (cursor_column < column) {
374 token.cursor_place = CURSOR_MIDDLE;
375 } else {
376 token.cursor_place = CURSOR_END;
377 }
378 }
379 } else {
380 // Multi line token.
381 if (cursor_line == start_line && cursor_column >= start_column) {
382 // Is in first line.
383 token.cursor_position = cursor_column - start_column;
384 if (cursor_column == start_column) {
385 token.cursor_place = CURSOR_BEGINNING;
386 } else {
387 token.cursor_place = CURSOR_MIDDLE;
388 }
389 } else if (cursor_line == line && cursor_column <= last_column) {
390 // Is in last line.
391 token.cursor_position = cursor_column - start_column;
392 if (cursor_column < column) {
393 token.cursor_place = CURSOR_MIDDLE;
394 } else {
395 token.cursor_place = CURSOR_END;
396 }
397 } else if (cursor_line > start_line && cursor_line < line) {
398 // Is in middle line.
399 token.cursor_position = CURSOR_MIDDLE;
400 }
401 }
402 }
403
404 last_token = token;
405 return token;
406}
407
408GDScriptTokenizer::Token GDScriptTokenizer::make_literal(const Variant &p_literal) {
409 Token token = make_token(Token::LITERAL);
410 token.literal = p_literal;
411 return token;
412}
413
414GDScriptTokenizer::Token GDScriptTokenizer::make_identifier(const StringName &p_identifier) {
415 Token identifier = make_token(Token::IDENTIFIER);
416 identifier.literal = p_identifier;
417 return identifier;
418}
419
420GDScriptTokenizer::Token GDScriptTokenizer::make_error(const String &p_message) {
421 Token error = make_token(Token::ERROR);
422 error.literal = p_message;
423
424 return error;
425}
426
427void GDScriptTokenizer::push_error(const String &p_message) {
428 Token error = make_error(p_message);
429 error_stack.push_back(error);
430}
431
// Queues an already built error token on the error stack. Used when the caller
// adjusted the token (e.g. its columns) after creating it with make_error().
void GDScriptTokenizer::push_error(const Token &p_error) {
	error_stack.push_back(p_error);
}
435
436GDScriptTokenizer::Token GDScriptTokenizer::make_paren_error(char32_t p_paren) {
437 if (paren_stack.is_empty()) {
438 return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart.", p_paren));
439 }
440 Token error = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\".", p_paren, paren_stack.back()->get()));
441 paren_stack.pop_back(); // Remove opening one anyway.
442 return error;
443}
444
445GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, Token::Type p_double_type) {
446 const char32_t *next = _current + 1;
447 int chars = 2; // Two already matched.
448
449 // Test before consuming characters, since we don't want to consume more than needed.
450 while (*next == p_test) {
451 chars++;
452 next++;
453 }
454 if (chars >= 7) {
455 // It is a VCS conflict marker.
456 while (chars > 1) {
457 // Consume all characters (first was already consumed by scan()).
458 _advance();
459 chars--;
460 }
461 return make_token(Token::VCS_CONFLICT_MARKER);
462 } else {
463 // It is only a regular double character token, so we consume the second character.
464 _advance();
465 return make_token(p_double_type);
466 }
467}
468
469GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
470 if (is_unicode_identifier_start(_peek())) {
471 _advance(); // Consume start character.
472 } else {
473 push_error("Expected annotation identifier after \"@\".");
474 }
475 while (is_unicode_identifier_continue(_peek())) {
476 // Consume all identifier characters.
477 _advance();
478 }
479 Token annotation = make_token(Token::ANNOTATION);
480 annotation.literal = StringName(annotation.source);
481 return annotation;
482}
483
// X-macro listing every keyword together with its token type.
// Entries are grouped by first character: KEYWORD_GROUP(c) opens the group for
// character `c` and each KEYWORD(text, type) belongs to the group before it.
// Callers pass the two macros to expand with (see make_keyword_list() and
// potential_identifier()).
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
	KEYWORD_GROUP('a') \
	KEYWORD("as", Token::AS) \
	KEYWORD("and", Token::AND) \
	KEYWORD("assert", Token::ASSERT) \
	KEYWORD("await", Token::AWAIT) \
	KEYWORD_GROUP('b') \
	KEYWORD("break", Token::BREAK) \
	KEYWORD("breakpoint", Token::BREAKPOINT) \
	KEYWORD_GROUP('c') \
	KEYWORD("class", Token::CLASS) \
	KEYWORD("class_name", Token::CLASS_NAME) \
	KEYWORD("const", Token::CONST) \
	KEYWORD("continue", Token::CONTINUE) \
	KEYWORD_GROUP('e') \
	KEYWORD("elif", Token::ELIF) \
	KEYWORD("else", Token::ELSE) \
	KEYWORD("enum", Token::ENUM) \
	KEYWORD("extends", Token::EXTENDS) \
	KEYWORD_GROUP('f') \
	KEYWORD("for", Token::FOR) \
	KEYWORD("func", Token::FUNC) \
	KEYWORD_GROUP('i') \
	KEYWORD("if", Token::IF) \
	KEYWORD("in", Token::IN) \
	KEYWORD("is", Token::IS) \
	KEYWORD_GROUP('m') \
	KEYWORD("match", Token::MATCH) \
	KEYWORD_GROUP('n') \
	KEYWORD("namespace", Token::NAMESPACE) \
	KEYWORD("not", Token::NOT) \
	KEYWORD_GROUP('o') \
	KEYWORD("or", Token::OR) \
	KEYWORD_GROUP('p') \
	KEYWORD("pass", Token::PASS) \
	KEYWORD("preload", Token::PRELOAD) \
	KEYWORD_GROUP('r') \
	KEYWORD("return", Token::RETURN) \
	KEYWORD_GROUP('s') \
	KEYWORD("self", Token::SELF) \
	KEYWORD("signal", Token::SIGNAL) \
	KEYWORD("static", Token::STATIC) \
	KEYWORD("super", Token::SUPER) \
	KEYWORD_GROUP('t') \
	KEYWORD("trait", Token::TRAIT) \
	KEYWORD_GROUP('v') \
	KEYWORD("var", Token::VAR) \
	KEYWORD("void", Token::VOID) \
	KEYWORD_GROUP('w') \
	KEYWORD("while", Token::WHILE) \
	KEYWORD_GROUP('y') \
	KEYWORD("yield", Token::YIELD) \
	KEYWORD_GROUP('I') \
	KEYWORD("INF", Token::CONST_INF) \
	KEYWORD_GROUP('N') \
	KEYWORD("NAN", Token::CONST_NAN) \
	KEYWORD_GROUP('P') \
	KEYWORD("PI", Token::CONST_PI) \
	KEYWORD_GROUP('T') \
	KEYWORD("TAU", Token::CONST_TAU)

// Length bounds of the keywords above. potential_identifier() uses them to
// skip the keyword lookup for names that cannot match, and static_asserts that
// every keyword respects them.
#define MIN_KEYWORD_LENGTH 2
#define MAX_KEYWORD_LENGTH 10
547
#ifdef DEBUG_ENABLED
// Fills `keyword_list` with the text of every keyword listed in KEYWORDS.
// The list is used by potential_identifier() for the confusable-identifier check.
void GDScriptTokenizer::make_keyword_list() {
// Each keyword expands to its string followed by a comma; group markers expand to nothing.
#define KEYWORD_LINE(keyword, token_type) keyword,
#define KEYWORD_GROUP_IGNORE(group)
	keyword_list = {
		KEYWORDS(KEYWORD_GROUP_IGNORE, KEYWORD_LINE)
	};
#undef KEYWORD_LINE
#undef KEYWORD_GROUP_IGNORE
}
#endif // DEBUG_ENABLED
559
560GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
561 bool only_ascii = _peek(-1) < 128;
562
563 // Consume all identifier characters.
564 while (is_unicode_identifier_continue(_peek())) {
565 char32_t c = _advance();
566 only_ascii = only_ascii && c < 128;
567 }
568
569 int len = _current - _start;
570
571 if (len == 1 && _peek(-1) == '_') {
572 // Lone underscore.
573 return make_token(Token::UNDERSCORE);
574 }
575
576 String name(_start, len);
577 if (len < MIN_KEYWORD_LENGTH || len > MAX_KEYWORD_LENGTH) {
578 // Cannot be a keyword, as the length doesn't match any.
579 return make_identifier(name);
580 }
581
582 if (!only_ascii) {
583 // Kept here in case the order with push_error matters.
584 Token id = make_identifier(name);
585
586#ifdef DEBUG_ENABLED
587 // Additional checks for identifiers but only in debug and if it's available in TextServer.
588 if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
589 int64_t confusable = TS->is_confusable(name, keyword_list);
590 if (confusable >= 0) {
591 push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
592 }
593 }
594#endif // DEBUG_ENABLED
595
596 // Cannot be a keyword, as keywords are ASCII only.
597 return id;
598 }
599
600 // Define some helper macros for the switch case.
601#define KEYWORD_GROUP_CASE(char) \
602 break; \
603 case char:
604#define KEYWORD(keyword, token_type) \
605 { \
606 const int keyword_length = sizeof(keyword) - 1; \
607 static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length"); \
608 static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \
609 if (keyword_length == len && name == keyword) { \
610 return make_token(token_type); \
611 } \
612 }
613
614 // Find if it's a keyword.
615 switch (_start[0]) {
616 default:
617 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
618 break;
619 }
620
621 // Check if it's a special literal
622 if (len == 4) {
623 if (name == "true") {
624 return make_literal(true);
625 } else if (name == "null") {
626 return make_literal(Variant());
627 }
628 } else if (len == 5) {
629 if (name == "false") {
630 return make_literal(false);
631 }
632 }
633
634 // Not a keyword, so must be an identifier.
635 return make_identifier(name);
636
637#undef KEYWORD_GROUP_CASE
638#undef KEYWORD
639}
640
641#undef MAX_KEYWORD_LENGTH
642#undef MIN_KEYWORD_LENGTH
643#undef KEYWORDS
644
645void GDScriptTokenizer::newline(bool p_make_token) {
646 // Don't overwrite previous newline, nor create if we want a line continuation.
647 if (p_make_token && !pending_newline && !line_continuation) {
648 Token newline(Token::NEWLINE);
649 newline.start_line = line;
650 newline.end_line = line;
651 newline.start_column = column - 1;
652 newline.end_column = column;
653 newline.leftmost_column = newline.start_column;
654 newline.rightmost_column = newline.end_column;
655 pending_newline = true;
656 last_token = newline;
657 last_newline = newline;
658 }
659
660 // Increment line/column counters.
661 line++;
662 column = 1;
663 leftmost_column = 1;
664}
665
666GDScriptTokenizer::Token GDScriptTokenizer::number() {
667 int base = 10;
668 bool has_decimal = false;
669 bool has_exponent = false;
670 bool has_error = false;
671 bool (*digit_check_func)(char32_t) = is_digit;
672
673 // Sign before hexadecimal or binary.
674 if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') {
675 _advance();
676 }
677
678 if (_peek(-1) == '.') {
679 has_decimal = true;
680 } else if (_peek(-1) == '0') {
681 if (_peek() == 'x') {
682 // Hexadecimal.
683 base = 16;
684 digit_check_func = is_hex_digit;
685 _advance();
686 } else if (_peek() == 'b') {
687 // Binary.
688 base = 2;
689 digit_check_func = is_binary_digit;
690 _advance();
691 }
692 }
693
694 if (base != 10 && is_underscore(_peek())) { // Disallow `0x_` and `0b_`.
695 Token error = make_error(vformat(R"(Unexpected underscore after "0%c".)", _peek(-1)));
696 error.start_column = column;
697 error.leftmost_column = column;
698 error.end_column = column + 1;
699 error.rightmost_column = column + 1;
700 push_error(error);
701 has_error = true;
702 }
703 bool previous_was_underscore = false; // Allow `_` to be used in a number, for readability.
704 while (digit_check_func(_peek()) || is_underscore(_peek())) {
705 if (is_underscore(_peek())) {
706 if (previous_was_underscore) {
707 Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
708 error.start_column = column;
709 error.leftmost_column = column;
710 error.end_column = column + 1;
711 error.rightmost_column = column + 1;
712 push_error(error);
713 }
714 previous_was_underscore = true;
715 } else {
716 previous_was_underscore = false;
717 }
718 _advance();
719 }
720
721 // It might be a ".." token (instead of decimal point) so we check if it's not.
722 if (_peek() == '.' && _peek(1) != '.') {
723 if (base == 10 && !has_decimal) {
724 has_decimal = true;
725 } else if (base == 10) {
726 Token error = make_error("Cannot use a decimal point twice in a number.");
727 error.start_column = column;
728 error.leftmost_column = column;
729 error.end_column = column + 1;
730 error.rightmost_column = column + 1;
731 push_error(error);
732 has_error = true;
733 } else if (base == 16) {
734 Token error = make_error("Cannot use a decimal point in a hexadecimal number.");
735 error.start_column = column;
736 error.leftmost_column = column;
737 error.end_column = column + 1;
738 error.rightmost_column = column + 1;
739 push_error(error);
740 has_error = true;
741 } else {
742 Token error = make_error("Cannot use a decimal point in a binary number.");
743 error.start_column = column;
744 error.leftmost_column = column;
745 error.end_column = column + 1;
746 error.rightmost_column = column + 1;
747 push_error(error);
748 has_error = true;
749 }
750 if (!has_error) {
751 _advance();
752
753 // Consume decimal digits.
754 if (is_underscore(_peek())) { // Disallow `10._`, but allow `10.`.
755 Token error = make_error(R"(Unexpected underscore after decimal point.)");
756 error.start_column = column;
757 error.leftmost_column = column;
758 error.end_column = column + 1;
759 error.rightmost_column = column + 1;
760 push_error(error);
761 has_error = true;
762 }
763 previous_was_underscore = false;
764 while (is_digit(_peek()) || is_underscore(_peek())) {
765 if (is_underscore(_peek())) {
766 if (previous_was_underscore) {
767 Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
768 error.start_column = column;
769 error.leftmost_column = column;
770 error.end_column = column + 1;
771 error.rightmost_column = column + 1;
772 push_error(error);
773 }
774 previous_was_underscore = true;
775 } else {
776 previous_was_underscore = false;
777 }
778 _advance();
779 }
780 }
781 }
782 if (base == 10) {
783 if (_peek() == 'e' || _peek() == 'E') {
784 has_exponent = true;
785 _advance();
786 if (_peek() == '+' || _peek() == '-') {
787 // Exponent sign.
788 _advance();
789 }
790 // Consume exponent digits.
791 if (!is_digit(_peek())) {
792 Token error = make_error(R"(Expected exponent value after "e".)");
793 error.start_column = column;
794 error.leftmost_column = column;
795 error.end_column = column + 1;
796 error.rightmost_column = column + 1;
797 push_error(error);
798 }
799 previous_was_underscore = false;
800 while (is_digit(_peek()) || is_underscore(_peek())) {
801 if (is_underscore(_peek())) {
802 if (previous_was_underscore) {
803 Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
804 error.start_column = column;
805 error.leftmost_column = column;
806 error.end_column = column + 1;
807 error.rightmost_column = column + 1;
808 push_error(error);
809 }
810 previous_was_underscore = true;
811 } else {
812 previous_was_underscore = false;
813 }
814 _advance();
815 }
816 }
817 }
818
819 // Detect extra decimal point.
820 if (!has_error && has_decimal && _peek() == '.' && _peek(1) != '.') {
821 Token error = make_error("Cannot use a decimal point twice in a number.");
822 error.start_column = column;
823 error.leftmost_column = column;
824 error.end_column = column + 1;
825 error.rightmost_column = column + 1;
826 push_error(error);
827 has_error = true;
828 } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
829 // Letter at the end of the number.
830 push_error("Invalid numeric notation.");
831 }
832
833 // Create a string with the whole number.
834 int len = _current - _start;
835 String number = String(_start, len).replace("_", "");
836
837 // Convert to the appropriate literal type.
838 if (base == 16) {
839 int64_t value = number.hex_to_int();
840 return make_literal(value);
841 } else if (base == 2) {
842 int64_t value = number.bin_to_int();
843 return make_literal(value);
844 } else if (has_decimal || has_exponent) {
845 double value = number.to_float();
846 return make_literal(value);
847 } else {
848 int64_t value = number.to_int();
849 return make_literal(value);
850 }
851}
852
853GDScriptTokenizer::Token GDScriptTokenizer::string() {
854 enum StringType {
855 STRING_REGULAR,
856 STRING_NAME,
857 STRING_NODEPATH,
858 };
859
860 bool is_multiline = false;
861 StringType type = STRING_REGULAR;
862
863 if (_peek(-1) == '&') {
864 type = STRING_NAME;
865 _advance();
866 } else if (_peek(-1) == '^') {
867 type = STRING_NODEPATH;
868 _advance();
869 }
870
871 char32_t quote_char = _peek(-1);
872
873 if (_peek() == quote_char && _peek(1) == quote_char) {
874 is_multiline = true;
875 // Consume all quotes.
876 _advance();
877 _advance();
878 }
879
880 String result;
881 char32_t prev = 0;
882 int prev_pos = 0;
883
884 for (;;) {
885 // Consume actual string.
886 if (_is_at_end()) {
887 return make_error("Unterminated string.");
888 }
889
890 char32_t ch = _peek();
891
892 if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
893 Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
894 error.start_column = column;
895 error.leftmost_column = error.start_column;
896 error.end_column = column + 1;
897 error.rightmost_column = error.end_column;
898 push_error(error);
899 }
900
901 if (ch == '\\') {
902 // Escape pattern.
903 _advance();
904 if (_is_at_end()) {
905 return make_error("Unterminated string.");
906 }
907
908 // Grab escape character.
909 char32_t code = _peek();
910 _advance();
911 if (_is_at_end()) {
912 return make_error("Unterminated string.");
913 }
914
915 char32_t escaped = 0;
916 bool valid_escape = true;
917
918 switch (code) {
919 case 'a':
920 escaped = '\a';
921 break;
922 case 'b':
923 escaped = '\b';
924 break;
925 case 'f':
926 escaped = '\f';
927 break;
928 case 'n':
929 escaped = '\n';
930 break;
931 case 'r':
932 escaped = '\r';
933 break;
934 case 't':
935 escaped = '\t';
936 break;
937 case 'v':
938 escaped = '\v';
939 break;
940 case '\'':
941 escaped = '\'';
942 break;
943 case '\"':
944 escaped = '\"';
945 break;
946 case '\\':
947 escaped = '\\';
948 break;
949 case 'U':
950 case 'u': {
951 // Hexadecimal sequence.
952 int hex_len = (code == 'U') ? 6 : 4;
953 for (int j = 0; j < hex_len; j++) {
954 if (_is_at_end()) {
955 return make_error("Unterminated string.");
956 }
957
958 char32_t digit = _peek();
959 char32_t value = 0;
960 if (is_digit(digit)) {
961 value = digit - '0';
962 } else if (digit >= 'a' && digit <= 'f') {
963 value = digit - 'a';
964 value += 10;
965 } else if (digit >= 'A' && digit <= 'F') {
966 value = digit - 'A';
967 value += 10;
968 } else {
969 // Make error, but keep parsing the string.
970 Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
971 error.start_column = column;
972 error.leftmost_column = error.start_column;
973 error.end_column = column + 1;
974 error.rightmost_column = error.end_column;
975 push_error(error);
976 valid_escape = false;
977 break;
978 }
979
980 escaped <<= 4;
981 escaped |= value;
982
983 _advance();
984 }
985 } break;
986 case '\r':
987 if (_peek() != '\n') {
988 // Carriage return without newline in string. (???)
989 // Just add it to the string and keep going.
990 result += ch;
991 _advance();
992 break;
993 }
994 [[fallthrough]];
995 case '\n':
996 // Escaping newline.
997 newline(false);
998 valid_escape = false; // Don't add to the string.
999 break;
1000 default:
1001 Token error = make_error("Invalid escape in string.");
1002 error.start_column = column - 2;
1003 error.leftmost_column = error.start_column;
1004 push_error(error);
1005 valid_escape = false;
1006 break;
1007 }
1008 // Parse UTF-16 pair.
1009 if (valid_escape) {
1010 if ((escaped & 0xfffffc00) == 0xd800) {
1011 if (prev == 0) {
1012 prev = escaped;
1013 prev_pos = column - 2;
1014 continue;
1015 } else {
1016 Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
1017 error.start_column = column - 2;
1018 error.leftmost_column = error.start_column;
1019 push_error(error);
1020 valid_escape = false;
1021 prev = 0;
1022 }
1023 } else if ((escaped & 0xfffffc00) == 0xdc00) {
1024 if (prev == 0) {
1025 Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
1026 error.start_column = column - 2;
1027 error.leftmost_column = error.start_column;
1028 push_error(error);
1029 valid_escape = false;
1030 } else {
1031 escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
1032 prev = 0;
1033 }
1034 }
1035 if (prev != 0) {
1036 Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
1037 error.start_column = prev_pos;
1038 error.leftmost_column = error.start_column;
1039 push_error(error);
1040 prev = 0;
1041 }
1042 }
1043
1044 if (valid_escape) {
1045 result += escaped;
1046 }
1047 } else if (ch == quote_char) {
1048 if (prev != 0) {
1049 Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
1050 error.start_column = prev_pos;
1051 error.leftmost_column = error.start_column;
1052 push_error(error);
1053 prev = 0;
1054 }
1055 _advance();
1056 if (is_multiline) {
1057 if (_peek() == quote_char && _peek(1) == quote_char) {
1058 // Ended the multiline string. Consume all quotes.
1059 _advance();
1060 _advance();
1061 break;
1062 } else {
1063 // Not a multiline string termination, add consumed quote.
1064 result += quote_char;
1065 }
1066 } else {
1067 // Ended single-line string.
1068 break;
1069 }
1070 } else {
1071 if (prev != 0) {
1072 Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
1073 error.start_column = prev_pos;
1074 error.leftmost_column = error.start_column;
1075 push_error(error);
1076 prev = 0;
1077 }
1078 result += ch;
1079 _advance();
1080 if (ch == '\n') {
1081 newline(false);
1082 }
1083 }
1084 }
1085 if (prev != 0) {
1086 Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
1087 error.start_column = prev_pos;
1088 error.leftmost_column = error.start_column;
1089 push_error(error);
1090 prev = 0;
1091 }
1092
1093 // Make the literal.
1094 Variant string;
1095 switch (type) {
1096 case STRING_NAME:
1097 string = StringName(result);
1098 break;
1099 case STRING_NODEPATH:
1100 string = NodePath(result);
1101 break;
1102 case STRING_REGULAR:
1103 string = result;
1104 break;
1105 }
1106
1107 return make_literal(string);
1108}
1109
1110void GDScriptTokenizer::check_indent() {
1111 ERR_FAIL_COND_MSG(column != 1, "Checking tokenizer indentation in the middle of a line.");
1112
1113 if (_is_at_end()) {
1114 // Send dedents for every indent level.
1115 pending_indents -= indent_level();
1116 indent_stack.clear();
1117 return;
1118 }
1119
1120 for (;;) {
1121 char32_t current_indent_char = _peek();
1122 int indent_count = 0;
1123
1124 if (current_indent_char != ' ' && current_indent_char != '\t' && current_indent_char != '\r' && current_indent_char != '\n' && current_indent_char != '#') {
1125 // First character of the line is not whitespace, so we clear all indentation levels.
1126 // Unless we are in a continuation or in multiline mode (inside expression).
1127 if (line_continuation || multiline_mode) {
1128 return;
1129 }
1130 pending_indents -= indent_level();
1131 indent_stack.clear();
1132 return;
1133 }
1134
1135 if (_peek() == '\r') {
1136 _advance();
1137 if (_peek() != '\n') {
1138 push_error("Stray carriage return character in source code.");
1139 }
1140 }
1141 if (_peek() == '\n') {
1142 // Empty line, keep going.
1143 _advance();
1144 newline(false);
1145 continue;
1146 }
1147
1148 // Check indent level.
1149 bool mixed = false;
1150 while (!_is_at_end()) {
1151 char32_t space = _peek();
1152 if (space == '\t') {
1153 // Consider individual tab columns.
1154 column += tab_size - 1;
1155 indent_count += tab_size;
1156 } else if (space == ' ') {
1157 indent_count += 1;
1158 } else {
1159 break;
1160 }
1161 mixed = mixed || space != current_indent_char;
1162 _advance();
1163 }
1164
1165 if (_is_at_end()) {
1166 // Reached the end with an empty line, so just dedent as much as needed.
1167 pending_indents -= indent_level();
1168 indent_stack.clear();
1169 return;
1170 }
1171
1172 if (_peek() == '\r') {
1173 _advance();
1174 if (_peek() != '\n') {
1175 push_error("Stray carriage return character in source code.");
1176 }
1177 }
1178 if (_peek() == '\n') {
1179 // Empty line, keep going.
1180 _advance();
1181 newline(false);
1182 continue;
1183 }
1184 if (_peek() == '#') {
1185 // Comment. Advance to the next line.
1186#ifdef TOOLS_ENABLED
1187 String comment;
1188 while (_peek() != '\n' && !_is_at_end()) {
1189 comment += _advance();
1190 }
1191 comments[line] = CommentData(comment, true);
1192#else
1193 while (_peek() != '\n' && !_is_at_end()) {
1194 _advance();
1195 }
1196#endif // TOOLS_ENABLED
1197 if (_is_at_end()) {
1198 // Reached the end with an empty line, so just dedent as much as needed.
1199 pending_indents -= indent_level();
1200 indent_stack.clear();
1201 return;
1202 }
1203 _advance(); // Consume '\n'.
1204 newline(false);
1205 continue;
1206 }
1207
1208 if (mixed && !line_continuation && !multiline_mode) {
1209 Token error = make_error("Mixed use of tabs and spaces for indentation.");
1210 error.start_line = line;
1211 error.start_column = 1;
1212 error.leftmost_column = 1;
1213 error.rightmost_column = column;
1214 push_error(error);
1215 }
1216
1217 if (line_continuation || multiline_mode) {
1218 // We cleared up all the whitespace at the beginning of the line.
1219 // But if this is a continuation or multiline mode and we don't want any indentation change.
1220 return;
1221 }
1222
1223 // Check if indentation character is consistent.
1224 if (indent_char == '\0') {
1225 // First time indenting, choose character now.
1226 indent_char = current_indent_char;
1227 } else if (current_indent_char != indent_char) {
1228 Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
1229 _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
1230 error.start_line = line;
1231 error.start_column = 1;
1232 error.leftmost_column = 1;
1233 error.rightmost_column = column;
1234 push_error(error);
1235 }
1236
1237 // Now we can do actual indentation changes.
1238
1239 // Check if indent or dedent.
1240 int previous_indent = 0;
1241 if (indent_level() > 0) {
1242 previous_indent = indent_stack.back()->get();
1243 }
1244 if (indent_count == previous_indent) {
1245 // No change in indentation.
1246 return;
1247 }
1248 if (indent_count > previous_indent) {
1249 // Indentation increased.
1250 indent_stack.push_back(indent_count);
1251 pending_indents++;
1252 } else {
1253 // Indentation decreased (dedent).
1254 if (indent_level() == 0) {
1255 push_error("Tokenizer bug: trying to dedent without previous indent.");
1256 return;
1257 }
1258 while (indent_level() > 0 && indent_stack.back()->get() > indent_count) {
1259 indent_stack.pop_back();
1260 pending_indents--;
1261 }
1262 if ((indent_level() > 0 && indent_stack.back()->get() != indent_count) || (indent_level() == 0 && indent_count != 0)) {
1263 // Mismatched indentation alignment.
1264 Token error = make_error("Unindent doesn't match the previous indentation level.");
1265 error.start_line = line;
1266 error.start_column = 1;
1267 error.leftmost_column = 1;
1268 error.end_column = column + 1;
1269 error.rightmost_column = column + 1;
1270 push_error(error);
1271 // Still, we'll be lenient and keep going, so keep this level in the stack.
1272 indent_stack.push_back(indent_count);
1273 }
1274 }
1275 break; // Get out of the loop in any case.
1276 }
1277}
1278
1279String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
1280 ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());
1281
1282 return ch == ' ' ? "space" : "tab";
1283}
1284
1285void GDScriptTokenizer::_skip_whitespace() {
1286 if (pending_indents != 0) {
1287 // Still have some indent/dedent tokens to give.
1288 return;
1289 }
1290
1291 bool is_bol = column == 1; // Beginning of line.
1292
1293 if (is_bol) {
1294 check_indent();
1295 return;
1296 }
1297
1298 for (;;) {
1299 char32_t c = _peek();
1300 switch (c) {
1301 case ' ':
1302 _advance();
1303 break;
1304 case '\t':
1305 _advance();
1306 // Consider individual tab columns.
1307 column += tab_size - 1;
1308 break;
1309 case '\r':
1310 _advance(); // Consume either way.
1311 if (_peek() != '\n') {
1312 push_error("Stray carriage return character in source code.");
1313 return;
1314 }
1315 break;
1316 case '\n':
1317 _advance();
1318 newline(!is_bol); // Don't create new line token if line is empty.
1319 check_indent();
1320 break;
1321 case '#': {
1322 // Comment.
1323#ifdef TOOLS_ENABLED
1324 String comment;
1325 while (_peek() != '\n' && !_is_at_end()) {
1326 comment += _advance();
1327 }
1328 comments[line] = CommentData(comment, is_bol);
1329#else
1330 while (_peek() != '\n' && !_is_at_end()) {
1331 _advance();
1332 }
1333#endif // TOOLS_ENABLED
1334 if (_is_at_end()) {
1335 return;
1336 }
1337 _advance(); // Consume '\n'
1338 newline(!is_bol);
1339 check_indent();
1340 } break;
1341 default:
1342 return;
1343 }
1344 }
1345}
1346
1347GDScriptTokenizer::Token GDScriptTokenizer::scan() {
1348 if (has_error()) {
1349 return pop_error();
1350 }
1351
1352 _skip_whitespace();
1353
1354 if (pending_newline) {
1355 pending_newline = false;
1356 if (!multiline_mode) {
1357 // Don't return newline tokens on multiline mode.
1358 return last_newline;
1359 }
1360 }
1361
1362 // Check for potential errors after skipping whitespace().
1363 if (has_error()) {
1364 return pop_error();
1365 }
1366
1367 _start = _current;
1368 start_line = line;
1369 start_column = column;
1370 leftmost_column = column;
1371 rightmost_column = column;
1372
1373 if (pending_indents != 0) {
1374 // Adjust position for indent.
1375 _start -= start_column - 1;
1376 start_column = 1;
1377 leftmost_column = 1;
1378 if (pending_indents > 0) {
1379 // Indents.
1380 pending_indents--;
1381 return make_token(Token::INDENT);
1382 } else {
1383 // Dedents.
1384 pending_indents++;
1385 Token dedent = make_token(Token::DEDENT);
1386 dedent.end_column += 1;
1387 dedent.rightmost_column += 1;
1388 return dedent;
1389 }
1390 }
1391
1392 if (_is_at_end()) {
1393 return make_token(Token::TK_EOF);
1394 }
1395
1396 const char32_t c = _advance();
1397
1398 if (c == '\\') {
1399 // Line continuation with backslash.
1400 if (_peek() == '\r') {
1401 if (_peek(1) != '\n') {
1402 return make_error("Unexpected carriage return character.");
1403 }
1404 _advance();
1405 }
1406 if (_peek() != '\n') {
1407 return make_error("Expected new line after \"\\\".");
1408 }
1409 _advance();
1410 newline(false);
1411 line_continuation = true;
1412 return scan(); // Recurse to get next token.
1413 }
1414
1415 line_continuation = false;
1416
1417 if (is_digit(c)) {
1418 return number();
1419 } else if (is_unicode_identifier_start(c)) {
1420 return potential_identifier();
1421 }
1422
1423 switch (c) {
1424 // String literals.
1425 case '"':
1426 case '\'':
1427 return string();
1428
1429 // Annotation.
1430 case '@':
1431 return annotation();
1432
1433 // Single characters.
1434 case '~':
1435 return make_token(Token::TILDE);
1436 case ',':
1437 return make_token(Token::COMMA);
1438 case ':':
1439 return make_token(Token::COLON);
1440 case ';':
1441 return make_token(Token::SEMICOLON);
1442 case '$':
1443 return make_token(Token::DOLLAR);
1444 case '?':
1445 return make_token(Token::QUESTION_MARK);
1446 case '`':
1447 return make_token(Token::BACKTICK);
1448
1449 // Parens.
1450 case '(':
1451 push_paren('(');
1452 return make_token(Token::PARENTHESIS_OPEN);
1453 case '[':
1454 push_paren('[');
1455 return make_token(Token::BRACKET_OPEN);
1456 case '{':
1457 push_paren('{');
1458 return make_token(Token::BRACE_OPEN);
1459 case ')':
1460 if (!pop_paren('(')) {
1461 return make_paren_error(c);
1462 }
1463 return make_token(Token::PARENTHESIS_CLOSE);
1464 case ']':
1465 if (!pop_paren('[')) {
1466 return make_paren_error(c);
1467 }
1468 return make_token(Token::BRACKET_CLOSE);
1469 case '}':
1470 if (!pop_paren('{')) {
1471 return make_paren_error(c);
1472 }
1473 return make_token(Token::BRACE_CLOSE);
1474
1475 // Double characters.
1476 case '!':
1477 if (_peek() == '=') {
1478 _advance();
1479 return make_token(Token::BANG_EQUAL);
1480 } else {
1481 return make_token(Token::BANG);
1482 }
1483 case '.':
1484 if (_peek() == '.') {
1485 _advance();
1486 return make_token(Token::PERIOD_PERIOD);
1487 } else if (is_digit(_peek())) {
1488 // Number starting with '.'.
1489 return number();
1490 } else {
1491 return make_token(Token::PERIOD);
1492 }
1493 case '+':
1494 if (_peek() == '=') {
1495 _advance();
1496 return make_token(Token::PLUS_EQUAL);
1497 } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
1498 // Number starting with '+'.
1499 return number();
1500 } else {
1501 return make_token(Token::PLUS);
1502 }
1503 case '-':
1504 if (_peek() == '=') {
1505 _advance();
1506 return make_token(Token::MINUS_EQUAL);
1507 } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
1508 // Number starting with '-'.
1509 return number();
1510 } else if (_peek() == '>') {
1511 _advance();
1512 return make_token(Token::FORWARD_ARROW);
1513 } else {
1514 return make_token(Token::MINUS);
1515 }
1516 case '*':
1517 if (_peek() == '=') {
1518 _advance();
1519 return make_token(Token::STAR_EQUAL);
1520 } else if (_peek() == '*') {
1521 if (_peek(1) == '=') {
1522 _advance();
1523 _advance(); // Advance both '*' and '='
1524 return make_token(Token::STAR_STAR_EQUAL);
1525 }
1526 _advance();
1527 return make_token(Token::STAR_STAR);
1528 } else {
1529 return make_token(Token::STAR);
1530 }
1531 case '/':
1532 if (_peek() == '=') {
1533 _advance();
1534 return make_token(Token::SLASH_EQUAL);
1535 } else {
1536 return make_token(Token::SLASH);
1537 }
1538 case '%':
1539 if (_peek() == '=') {
1540 _advance();
1541 return make_token(Token::PERCENT_EQUAL);
1542 } else {
1543 return make_token(Token::PERCENT);
1544 }
1545 case '^':
1546 if (_peek() == '=') {
1547 _advance();
1548 return make_token(Token::CARET_EQUAL);
1549 } else if (_peek() == '"' || _peek() == '\'') {
1550 // Node path
1551 return string();
1552 } else {
1553 return make_token(Token::CARET);
1554 }
1555 case '&':
1556 if (_peek() == '&') {
1557 _advance();
1558 return make_token(Token::AMPERSAND_AMPERSAND);
1559 } else if (_peek() == '=') {
1560 _advance();
1561 return make_token(Token::AMPERSAND_EQUAL);
1562 } else if (_peek() == '"' || _peek() == '\'') {
1563 // String Name
1564 return string();
1565 } else {
1566 return make_token(Token::AMPERSAND);
1567 }
1568 case '|':
1569 if (_peek() == '|') {
1570 _advance();
1571 return make_token(Token::PIPE_PIPE);
1572 } else if (_peek() == '=') {
1573 _advance();
1574 return make_token(Token::PIPE_EQUAL);
1575 } else {
1576 return make_token(Token::PIPE);
1577 }
1578
1579 // Potential VCS conflict markers.
1580 case '=':
1581 if (_peek() == '=') {
1582 return check_vcs_marker('=', Token::EQUAL_EQUAL);
1583 } else {
1584 return make_token(Token::EQUAL);
1585 }
1586 case '<':
1587 if (_peek() == '=') {
1588 _advance();
1589 return make_token(Token::LESS_EQUAL);
1590 } else if (_peek() == '<') {
1591 if (_peek(1) == '=') {
1592 _advance();
1593 _advance(); // Advance both '<' and '='
1594 return make_token(Token::LESS_LESS_EQUAL);
1595 } else {
1596 return check_vcs_marker('<', Token::LESS_LESS);
1597 }
1598 } else {
1599 return make_token(Token::LESS);
1600 }
1601 case '>':
1602 if (_peek() == '=') {
1603 _advance();
1604 return make_token(Token::GREATER_EQUAL);
1605 } else if (_peek() == '>') {
1606 if (_peek(1) == '=') {
1607 _advance();
1608 _advance(); // Advance both '>' and '='
1609 return make_token(Token::GREATER_GREATER_EQUAL);
1610 } else {
1611 return check_vcs_marker('>', Token::GREATER_GREATER);
1612 }
1613 } else {
1614 return make_token(Token::GREATER);
1615 }
1616
1617 default:
1618 if (is_whitespace(c)) {
1619 return make_error(vformat(R"(Invalid white space character U+%04X.)", static_cast<int32_t>(c)));
1620 } else {
1621 return make_error(vformat(R"(Invalid character "%c" (U+%04X).)", c, static_cast<int32_t>(c)));
1622 }
1623 }
1624}
1625
1626GDScriptTokenizer::GDScriptTokenizer() {
1627#ifdef TOOLS_ENABLED
1628 if (EditorSettings::get_singleton()) {
1629 tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size");
1630 }
1631#endif // TOOLS_ENABLED
1632#ifdef DEBUG_ENABLED
1633 make_keyword_list();
1634#endif // DEBUG_ENABLED
1635}
1636