1 | /**************************************************************************/ |
2 | /* gdscript_tokenizer.cpp */ |
3 | /**************************************************************************/ |
4 | /* This file is part of: */ |
5 | /* GODOT ENGINE */ |
6 | /* https://godotengine.org */ |
7 | /**************************************************************************/ |
8 | /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ |
9 | /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ |
10 | /* */ |
11 | /* Permission is hereby granted, free of charge, to any person obtaining */ |
12 | /* a copy of this software and associated documentation files (the */ |
13 | /* "Software"), to deal in the Software without restriction, including */ |
14 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
15 | /* distribute, sublicense, and/or sell copies of the Software, and to */ |
16 | /* permit persons to whom the Software is furnished to do so, subject to */ |
17 | /* the following conditions: */ |
18 | /* */ |
19 | /* The above copyright notice and this permission notice shall be */ |
20 | /* included in all copies or substantial portions of the Software. */ |
21 | /* */ |
22 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ |
23 | /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ |
24 | /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ |
25 | /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ |
26 | /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ |
27 | /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ |
28 | /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ |
29 | /**************************************************************************/ |
30 | |
31 | #include "gdscript_tokenizer.h" |
32 | |
33 | #include "core/error/error_macros.h" |
34 | #include "core/string/char_utils.h" |
35 | |
36 | #ifdef DEBUG_ENABLED |
37 | #include "servers/text_server.h" |
38 | #endif |
39 | |
40 | #ifdef TOOLS_ENABLED |
41 | #include "editor/editor_settings.h" |
42 | #endif |
43 | |
// Display name of every token type.
// Token::get_name() and GDScriptTokenizer::get_token_name() index this table with the
// Token::Type value, so the entries have to stay in the same order as the enum.
static const char *token_names[] = {
	/* EMPTY */ "Empty",

	// Basic
	/* ANNOTATION */ "Annotation",
	/* IDENTIFIER */ "Identifier",
	/* LITERAL */ "Literal",

	// Comparison
	/* LESS */ "<",
	/* LESS_EQUAL */ "<=",
	/* GREATER */ ">",
	/* GREATER_EQUAL */ ">=",
	/* EQUAL_EQUAL */ "==",
	/* BANG_EQUAL */ "!=",

	// Logical
	/* AND */ "and",
	/* OR */ "or",
	/* NOT */ "not",
	/* AMPERSAND_AMPERSAND */ "&&",
	/* PIPE_PIPE */ "||",
	/* BANG */ "!",

	// Bitwise
	/* AMPERSAND */ "&",
	/* PIPE */ "|",
	/* TILDE */ "~",
	/* CARET */ "^",
	/* LESS_LESS */ "<<",
	/* GREATER_GREATER */ ">>",

	// Math
	/* PLUS */ "+",
	/* MINUS */ "-",
	/* STAR */ "*",
	/* STAR_STAR */ "**",
	/* SLASH */ "/",
	/* PERCENT */ "%",

	// Assignment
	/* EQUAL */ "=",
	/* PLUS_EQUAL */ "+=",
	/* MINUS_EQUAL */ "-=",
	/* STAR_EQUAL */ "*=",
	/* STAR_STAR_EQUAL */ "**=",
	/* SLASH_EQUAL */ "/=",
	/* PERCENT_EQUAL */ "%=",
	/* LESS_LESS_EQUAL */ "<<=",
	/* GREATER_GREATER_EQUAL */ ">>=",
	/* AMPERSAND_EQUAL */ "&=",
	/* PIPE_EQUAL */ "|=",
	/* CARET_EQUAL */ "^=",

	// Control flow
	/* IF */ "if",
	/* ELIF */ "elif",
	/* ELSE */ "else",
	/* FOR */ "for",
	/* WHILE */ "while",
	/* BREAK */ "break",
	/* CONTINUE */ "continue",
	/* PASS */ "pass",
	/* RETURN */ "return",
	/* MATCH */ "match",

	// Keywords
	/* AS */ "as",
	/* ASSERT */ "assert",
	/* AWAIT */ "await",
	/* BREAKPOINT */ "breakpoint",
	/* CLASS */ "class",
	/* CLASS_NAME */ "class_name",
	/* CONST */ "const",
	/* ENUM */ "enum",
	/* EXTENDS */ "extends",
	/* FUNC */ "func",
	/* IN */ "in",
	/* IS */ "is",
	/* NAMESPACE */ "namespace",
	/* PRELOAD */ "preload",
	/* SELF */ "self",
	/* SIGNAL */ "signal",
	/* STATIC */ "static",
	/* SUPER */ "super",
	/* TRAIT */ "trait",
	/* VAR */ "var",
	/* VOID */ "void",
	/* YIELD */ "yield",

	// Punctuation
	/* BRACKET_OPEN */ "[",
	/* BRACKET_CLOSE */ "]",
	/* BRACE_OPEN */ "{",
	/* BRACE_CLOSE */ "}",
	/* PARENTHESIS_OPEN */ "(",
	/* PARENTHESIS_CLOSE */ ")",
	/* COMMA */ ",",
	/* SEMICOLON */ ";",
	/* PERIOD */ ".",
	/* PERIOD_PERIOD */ "..",
	/* COLON */ ":",
	/* DOLLAR */ "$",
	/* FORWARD_ARROW */ "->",
	/* UNDERSCORE */ "_",

	// Whitespace
	/* NEWLINE */ "Newline",
	/* INDENT */ "Indent",
	/* DEDENT */ "Dedent",

	// Constants
	/* CONST_PI */ "PI",
	/* CONST_TAU */ "TAU",
	/* CONST_INF */ "INF",
	/* CONST_NAN */ "NaN",

	// Error message improvement
	/* VCS_CONFLICT_MARKER */ "VCS conflict marker",
	/* BACKTICK */ "`",
	/* QUESTION_MARK */ "?",

	// Special
	/* ERROR */ "Error",
	/* EOF */ "End of file",
};
157 | |
158 | // Avoid desync. |
159 | static_assert(sizeof(token_names) / sizeof(token_names[0]) == GDScriptTokenizer::Token::TK_MAX, "Amount of token names don't match the amount of token types." ); |
160 | |
161 | const char *GDScriptTokenizer::Token::get_name() const { |
162 | ERR_FAIL_INDEX_V_MSG(type, TK_MAX, "<error>" , "Using token type out of the enum." ); |
163 | return token_names[type]; |
164 | } |
165 | |
166 | bool GDScriptTokenizer::Token::can_precede_bin_op() const { |
167 | switch (type) { |
168 | case IDENTIFIER: |
169 | case LITERAL: |
170 | case SELF: |
171 | case BRACKET_CLOSE: |
172 | case BRACE_CLOSE: |
173 | case PARENTHESIS_CLOSE: |
174 | case CONST_PI: |
175 | case CONST_TAU: |
176 | case CONST_INF: |
177 | case CONST_NAN: |
178 | return true; |
179 | default: |
180 | return false; |
181 | } |
182 | } |
183 | |
184 | bool GDScriptTokenizer::Token::is_identifier() const { |
185 | // Note: Most keywords should not be recognized as identifiers. |
186 | // These are only exceptions for stuff that already is on the engine's API. |
187 | switch (type) { |
188 | case IDENTIFIER: |
189 | case MATCH: // Used in String.match(). |
190 | // Allow constants to be treated as regular identifiers. |
191 | case CONST_PI: |
192 | case CONST_INF: |
193 | case CONST_NAN: |
194 | case CONST_TAU: |
195 | return true; |
196 | default: |
197 | return false; |
198 | } |
199 | } |
200 | |
201 | bool GDScriptTokenizer::Token::is_node_name() const { |
202 | // This is meant to allow keywords with the $ notation, but not as general identifiers. |
203 | switch (type) { |
204 | case IDENTIFIER: |
205 | case AND: |
206 | case AS: |
207 | case ASSERT: |
208 | case AWAIT: |
209 | case BREAK: |
210 | case BREAKPOINT: |
211 | case CLASS_NAME: |
212 | case CLASS: |
213 | case CONST: |
214 | case CONST_PI: |
215 | case CONST_INF: |
216 | case CONST_NAN: |
217 | case CONST_TAU: |
218 | case CONTINUE: |
219 | case ELIF: |
220 | case ELSE: |
221 | case ENUM: |
222 | case EXTENDS: |
223 | case FOR: |
224 | case FUNC: |
225 | case IF: |
226 | case IN: |
227 | case IS: |
228 | case MATCH: |
229 | case NAMESPACE: |
230 | case NOT: |
231 | case OR: |
232 | case PASS: |
233 | case PRELOAD: |
234 | case RETURN: |
235 | case SELF: |
236 | case SIGNAL: |
237 | case STATIC: |
238 | case SUPER: |
239 | case TRAIT: |
240 | case UNDERSCORE: |
241 | case VAR: |
242 | case VOID: |
243 | case WHILE: |
244 | case YIELD: |
245 | return true; |
246 | default: |
247 | return false; |
248 | } |
249 | } |
250 | |
251 | String GDScriptTokenizer::get_token_name(Token::Type p_token_type) { |
252 | ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>" , "Using token type out of the enum." ); |
253 | return token_names[p_token_type]; |
254 | } |
255 | |
256 | void GDScriptTokenizer::set_source_code(const String &p_source_code) { |
257 | source = p_source_code; |
258 | if (source.is_empty()) { |
259 | _source = U"" ; |
260 | } else { |
261 | _source = source.ptr(); |
262 | } |
263 | _current = _source; |
264 | line = 1; |
265 | column = 1; |
266 | length = p_source_code.length(); |
267 | position = 0; |
268 | } |
269 | |
270 | void GDScriptTokenizer::set_cursor_position(int p_line, int p_column) { |
271 | cursor_line = p_line; |
272 | cursor_column = p_column; |
273 | } |
274 | |
275 | void GDScriptTokenizer::set_multiline_mode(bool p_state) { |
276 | multiline_mode = p_state; |
277 | } |
278 | |
279 | void GDScriptTokenizer::push_expression_indented_block() { |
280 | indent_stack_stack.push_back(indent_stack); |
281 | } |
282 | |
283 | void GDScriptTokenizer::pop_expression_indented_block() { |
284 | ERR_FAIL_COND(indent_stack_stack.size() == 0); |
285 | indent_stack = indent_stack_stack.back()->get(); |
286 | indent_stack_stack.pop_back(); |
287 | } |
288 | |
289 | int GDScriptTokenizer::get_cursor_line() const { |
290 | return cursor_line; |
291 | } |
292 | |
293 | int GDScriptTokenizer::get_cursor_column() const { |
294 | return cursor_column; |
295 | } |
296 | |
297 | bool GDScriptTokenizer::is_past_cursor() const { |
298 | if (line < cursor_line) { |
299 | return false; |
300 | } |
301 | if (line > cursor_line) { |
302 | return true; |
303 | } |
304 | if (column < cursor_column) { |
305 | return false; |
306 | } |
307 | return true; |
308 | } |
309 | |
310 | char32_t GDScriptTokenizer::_advance() { |
311 | if (unlikely(_is_at_end())) { |
312 | return '\0'; |
313 | } |
314 | _current++; |
315 | column++; |
316 | position++; |
317 | if (column > rightmost_column) { |
318 | rightmost_column = column; |
319 | } |
320 | if (unlikely(_is_at_end())) { |
321 | // Add extra newline even if it's not there, to satisfy the parser. |
322 | newline(true); |
323 | // Also add needed unindent. |
324 | check_indent(); |
325 | } |
326 | return _peek(-1); |
327 | } |
328 | |
329 | void GDScriptTokenizer::push_paren(char32_t p_char) { |
330 | paren_stack.push_back(p_char); |
331 | } |
332 | |
333 | bool GDScriptTokenizer::pop_paren(char32_t p_expected) { |
334 | if (paren_stack.is_empty()) { |
335 | return false; |
336 | } |
337 | char32_t actual = paren_stack.back()->get(); |
338 | paren_stack.pop_back(); |
339 | |
340 | return actual == p_expected; |
341 | } |
342 | |
343 | GDScriptTokenizer::Token GDScriptTokenizer::pop_error() { |
344 | Token error = error_stack.back()->get(); |
345 | error_stack.pop_back(); |
346 | return error; |
347 | } |
348 | |
349 | GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) { |
350 | Token token(p_type); |
351 | token.start_line = start_line; |
352 | token.end_line = line; |
353 | token.start_column = start_column; |
354 | token.end_column = column; |
355 | token.leftmost_column = leftmost_column; |
356 | token.rightmost_column = rightmost_column; |
357 | token.source = String(_start, _current - _start); |
358 | |
359 | if (p_type != Token::ERROR && cursor_line > -1) { |
360 | // Also count whitespace after token. |
361 | int offset = 0; |
362 | while (_peek(offset) == ' ' || _peek(offset) == '\t') { |
363 | offset++; |
364 | } |
365 | int last_column = column + offset; |
366 | // Check cursor position in token. |
367 | if (start_line == line) { |
368 | // Single line token. |
369 | if (cursor_line == start_line && cursor_column >= start_column && cursor_column <= last_column) { |
370 | token.cursor_position = cursor_column - start_column; |
371 | if (cursor_column == start_column) { |
372 | token.cursor_place = CURSOR_BEGINNING; |
373 | } else if (cursor_column < column) { |
374 | token.cursor_place = CURSOR_MIDDLE; |
375 | } else { |
376 | token.cursor_place = CURSOR_END; |
377 | } |
378 | } |
379 | } else { |
380 | // Multi line token. |
381 | if (cursor_line == start_line && cursor_column >= start_column) { |
382 | // Is in first line. |
383 | token.cursor_position = cursor_column - start_column; |
384 | if (cursor_column == start_column) { |
385 | token.cursor_place = CURSOR_BEGINNING; |
386 | } else { |
387 | token.cursor_place = CURSOR_MIDDLE; |
388 | } |
389 | } else if (cursor_line == line && cursor_column <= last_column) { |
390 | // Is in last line. |
391 | token.cursor_position = cursor_column - start_column; |
392 | if (cursor_column < column) { |
393 | token.cursor_place = CURSOR_MIDDLE; |
394 | } else { |
395 | token.cursor_place = CURSOR_END; |
396 | } |
397 | } else if (cursor_line > start_line && cursor_line < line) { |
398 | // Is in middle line. |
399 | token.cursor_position = CURSOR_MIDDLE; |
400 | } |
401 | } |
402 | } |
403 | |
404 | last_token = token; |
405 | return token; |
406 | } |
407 | |
408 | GDScriptTokenizer::Token GDScriptTokenizer::make_literal(const Variant &p_literal) { |
409 | Token token = make_token(Token::LITERAL); |
410 | token.literal = p_literal; |
411 | return token; |
412 | } |
413 | |
414 | GDScriptTokenizer::Token GDScriptTokenizer::make_identifier(const StringName &p_identifier) { |
415 | Token identifier = make_token(Token::IDENTIFIER); |
416 | identifier.literal = p_identifier; |
417 | return identifier; |
418 | } |
419 | |
420 | GDScriptTokenizer::Token GDScriptTokenizer::make_error(const String &p_message) { |
421 | Token error = make_token(Token::ERROR); |
422 | error.literal = p_message; |
423 | |
424 | return error; |
425 | } |
426 | |
427 | void GDScriptTokenizer::push_error(const String &p_message) { |
428 | Token error = make_error(p_message); |
429 | error_stack.push_back(error); |
430 | } |
431 | |
432 | void GDScriptTokenizer::push_error(const Token &p_error) { |
433 | error_stack.push_back(p_error); |
434 | } |
435 | |
436 | GDScriptTokenizer::Token GDScriptTokenizer::make_paren_error(char32_t p_paren) { |
437 | if (paren_stack.is_empty()) { |
438 | return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart." , p_paren)); |
439 | } |
440 | Token error = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\"." , p_paren, paren_stack.back()->get())); |
441 | paren_stack.pop_back(); // Remove opening one anyway. |
442 | return error; |
443 | } |
444 | |
445 | GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, Token::Type p_double_type) { |
446 | const char32_t *next = _current + 1; |
447 | int chars = 2; // Two already matched. |
448 | |
449 | // Test before consuming characters, since we don't want to consume more than needed. |
450 | while (*next == p_test) { |
451 | chars++; |
452 | next++; |
453 | } |
454 | if (chars >= 7) { |
455 | // It is a VCS conflict marker. |
456 | while (chars > 1) { |
457 | // Consume all characters (first was already consumed by scan()). |
458 | _advance(); |
459 | chars--; |
460 | } |
461 | return make_token(Token::VCS_CONFLICT_MARKER); |
462 | } else { |
463 | // It is only a regular double character token, so we consume the second character. |
464 | _advance(); |
465 | return make_token(p_double_type); |
466 | } |
467 | } |
468 | |
469 | GDScriptTokenizer::Token GDScriptTokenizer::annotation() { |
470 | if (is_unicode_identifier_start(_peek())) { |
471 | _advance(); // Consume start character. |
472 | } else { |
473 | push_error("Expected annotation identifier after \"@\"." ); |
474 | } |
475 | while (is_unicode_identifier_continue(_peek())) { |
476 | // Consume all identifier characters. |
477 | _advance(); |
478 | } |
479 | Token annotation = make_token(Token::ANNOTATION); |
480 | annotation.literal = StringName(annotation.source); |
481 | return annotation; |
482 | } |
483 | |
// Table of every reserved word and the token type it maps to, written as an X-macro.
// KEYWORD_GROUP(c) is invoked once in front of the keywords that start with the character c,
// and KEYWORD(text, type) once per keyword, so each user of the table decides what the
// entries expand to (see make_keyword_list() and potential_identifier()).
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
	KEYWORD_GROUP('a') \
	KEYWORD("as", Token::AS) \
	KEYWORD("and", Token::AND) \
	KEYWORD("assert", Token::ASSERT) \
	KEYWORD("await", Token::AWAIT) \
	KEYWORD_GROUP('b') \
	KEYWORD("break", Token::BREAK) \
	KEYWORD("breakpoint", Token::BREAKPOINT) \
	KEYWORD_GROUP('c') \
	KEYWORD("class", Token::CLASS) \
	KEYWORD("class_name", Token::CLASS_NAME) \
	KEYWORD("const", Token::CONST) \
	KEYWORD("continue", Token::CONTINUE) \
	KEYWORD_GROUP('e') \
	KEYWORD("elif", Token::ELIF) \
	KEYWORD("else", Token::ELSE) \
	KEYWORD("enum", Token::ENUM) \
	KEYWORD("extends", Token::EXTENDS) \
	KEYWORD_GROUP('f') \
	KEYWORD("for", Token::FOR) \
	KEYWORD("func", Token::FUNC) \
	KEYWORD_GROUP('i') \
	KEYWORD("if", Token::IF) \
	KEYWORD("in", Token::IN) \
	KEYWORD("is", Token::IS) \
	KEYWORD_GROUP('m') \
	KEYWORD("match", Token::MATCH) \
	KEYWORD_GROUP('n') \
	KEYWORD("namespace", Token::NAMESPACE) \
	KEYWORD("not", Token::NOT) \
	KEYWORD_GROUP('o') \
	KEYWORD("or", Token::OR) \
	KEYWORD_GROUP('p') \
	KEYWORD("pass", Token::PASS) \
	KEYWORD("preload", Token::PRELOAD) \
	KEYWORD_GROUP('r') \
	KEYWORD("return", Token::RETURN) \
	KEYWORD_GROUP('s') \
	KEYWORD("self", Token::SELF) \
	KEYWORD("signal", Token::SIGNAL) \
	KEYWORD("static", Token::STATIC) \
	KEYWORD("super", Token::SUPER) \
	KEYWORD_GROUP('t') \
	KEYWORD("trait", Token::TRAIT) \
	KEYWORD_GROUP('v') \
	KEYWORD("var", Token::VAR) \
	KEYWORD("void", Token::VOID) \
	KEYWORD_GROUP('w') \
	KEYWORD("while", Token::WHILE) \
	KEYWORD_GROUP('y') \
	KEYWORD("yield", Token::YIELD) \
	KEYWORD_GROUP('I') \
	KEYWORD("INF", Token::CONST_INF) \
	KEYWORD_GROUP('N') \
	KEYWORD("NAN", Token::CONST_NAN) \
	KEYWORD_GROUP('P') \
	KEYWORD("PI", Token::CONST_PI) \
	KEYWORD_GROUP('T') \
	KEYWORD("TAU", Token::CONST_TAU)

// Shortest and longest keyword length. potential_identifier() uses these bounds to skip the
// keyword lookup for words outside the range, and static_asserts every keyword against them.
#define MIN_KEYWORD_LENGTH 2
#define MAX_KEYWORD_LENGTH 10
547 | |
#ifdef DEBUG_ENABLED
// Fills keyword_list with the spelling of every entry of the KEYWORDS table, in table order.
// Only compiled when DEBUG_ENABLED is defined.
void GDScriptTokenizer::make_keyword_list() {
	// Group markers expand to nothing; each keyword expands to its string plus a comma.
#define SKIP_KEYWORD_GROUP(m_group)
#define KEYWORD_TO_LIST_ENTRY(m_keyword, m_token_type) m_keyword,
	keyword_list = {
		KEYWORDS(SKIP_KEYWORD_GROUP, KEYWORD_TO_LIST_ENTRY)
	};
#undef KEYWORD_TO_LIST_ENTRY
#undef SKIP_KEYWORD_GROUP
}
#endif // DEBUG_ENABLED
559 | |
560 | GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { |
561 | bool only_ascii = _peek(-1) < 128; |
562 | |
563 | // Consume all identifier characters. |
564 | while (is_unicode_identifier_continue(_peek())) { |
565 | char32_t c = _advance(); |
566 | only_ascii = only_ascii && c < 128; |
567 | } |
568 | |
569 | int len = _current - _start; |
570 | |
571 | if (len == 1 && _peek(-1) == '_') { |
572 | // Lone underscore. |
573 | return make_token(Token::UNDERSCORE); |
574 | } |
575 | |
576 | String name(_start, len); |
577 | if (len < MIN_KEYWORD_LENGTH || len > MAX_KEYWORD_LENGTH) { |
578 | // Cannot be a keyword, as the length doesn't match any. |
579 | return make_identifier(name); |
580 | } |
581 | |
582 | if (!only_ascii) { |
583 | // Kept here in case the order with push_error matters. |
584 | Token id = make_identifier(name); |
585 | |
586 | #ifdef DEBUG_ENABLED |
587 | // Additional checks for identifiers but only in debug and if it's available in TextServer. |
588 | if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) { |
589 | int64_t confusable = TS->is_confusable(name, keyword_list); |
590 | if (confusable >= 0) { |
591 | push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)" , name, keyword_list[confusable])); |
592 | } |
593 | } |
594 | #endif // DEBUG_ENABLED |
595 | |
596 | // Cannot be a keyword, as keywords are ASCII only. |
597 | return id; |
598 | } |
599 | |
600 | // Define some helper macros for the switch case. |
601 | #define KEYWORD_GROUP_CASE(char) \ |
602 | break; \ |
603 | case char: |
604 | #define KEYWORD(keyword, token_type) \ |
605 | { \ |
606 | const int keyword_length = sizeof(keyword) - 1; \ |
607 | static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length"); \ |
608 | static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \ |
609 | if (keyword_length == len && name == keyword) { \ |
610 | return make_token(token_type); \ |
611 | } \ |
612 | } |
613 | |
614 | // Find if it's a keyword. |
615 | switch (_start[0]) { |
616 | default: |
617 | KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) |
618 | break; |
619 | } |
620 | |
621 | // Check if it's a special literal |
622 | if (len == 4) { |
623 | if (name == "true" ) { |
624 | return make_literal(true); |
625 | } else if (name == "null" ) { |
626 | return make_literal(Variant()); |
627 | } |
628 | } else if (len == 5) { |
629 | if (name == "false" ) { |
630 | return make_literal(false); |
631 | } |
632 | } |
633 | |
634 | // Not a keyword, so must be an identifier. |
635 | return make_identifier(name); |
636 | |
637 | #undef KEYWORD_GROUP_CASE |
638 | #undef KEYWORD |
639 | } |
640 | |
// The keyword table and its length bounds are not used past this point.
#undef MAX_KEYWORD_LENGTH
#undef MIN_KEYWORD_LENGTH
#undef KEYWORDS
644 | |
645 | void GDScriptTokenizer::newline(bool p_make_token) { |
646 | // Don't overwrite previous newline, nor create if we want a line continuation. |
647 | if (p_make_token && !pending_newline && !line_continuation) { |
648 | Token newline(Token::NEWLINE); |
649 | newline.start_line = line; |
650 | newline.end_line = line; |
651 | newline.start_column = column - 1; |
652 | newline.end_column = column; |
653 | newline.leftmost_column = newline.start_column; |
654 | newline.rightmost_column = newline.end_column; |
655 | pending_newline = true; |
656 | last_token = newline; |
657 | last_newline = newline; |
658 | } |
659 | |
660 | // Increment line/column counters. |
661 | line++; |
662 | column = 1; |
663 | leftmost_column = 1; |
664 | } |
665 | |
666 | GDScriptTokenizer::Token GDScriptTokenizer::number() { |
667 | int base = 10; |
668 | bool has_decimal = false; |
669 | bool has_exponent = false; |
670 | bool has_error = false; |
671 | bool (*digit_check_func)(char32_t) = is_digit; |
672 | |
673 | // Sign before hexadecimal or binary. |
674 | if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') { |
675 | _advance(); |
676 | } |
677 | |
678 | if (_peek(-1) == '.') { |
679 | has_decimal = true; |
680 | } else if (_peek(-1) == '0') { |
681 | if (_peek() == 'x') { |
682 | // Hexadecimal. |
683 | base = 16; |
684 | digit_check_func = is_hex_digit; |
685 | _advance(); |
686 | } else if (_peek() == 'b') { |
687 | // Binary. |
688 | base = 2; |
689 | digit_check_func = is_binary_digit; |
690 | _advance(); |
691 | } |
692 | } |
693 | |
694 | if (base != 10 && is_underscore(_peek())) { // Disallow `0x_` and `0b_`. |
695 | Token error = make_error(vformat(R"(Unexpected underscore after "0%c".)" , _peek(-1))); |
696 | error.start_column = column; |
697 | error.leftmost_column = column; |
698 | error.end_column = column + 1; |
699 | error.rightmost_column = column + 1; |
700 | push_error(error); |
701 | has_error = true; |
702 | } |
703 | bool previous_was_underscore = false; // Allow `_` to be used in a number, for readability. |
704 | while (digit_check_func(_peek()) || is_underscore(_peek())) { |
705 | if (is_underscore(_peek())) { |
706 | if (previous_was_underscore) { |
707 | Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)" ); |
708 | error.start_column = column; |
709 | error.leftmost_column = column; |
710 | error.end_column = column + 1; |
711 | error.rightmost_column = column + 1; |
712 | push_error(error); |
713 | } |
714 | previous_was_underscore = true; |
715 | } else { |
716 | previous_was_underscore = false; |
717 | } |
718 | _advance(); |
719 | } |
720 | |
721 | // It might be a ".." token (instead of decimal point) so we check if it's not. |
722 | if (_peek() == '.' && _peek(1) != '.') { |
723 | if (base == 10 && !has_decimal) { |
724 | has_decimal = true; |
725 | } else if (base == 10) { |
726 | Token error = make_error("Cannot use a decimal point twice in a number." ); |
727 | error.start_column = column; |
728 | error.leftmost_column = column; |
729 | error.end_column = column + 1; |
730 | error.rightmost_column = column + 1; |
731 | push_error(error); |
732 | has_error = true; |
733 | } else if (base == 16) { |
734 | Token error = make_error("Cannot use a decimal point in a hexadecimal number." ); |
735 | error.start_column = column; |
736 | error.leftmost_column = column; |
737 | error.end_column = column + 1; |
738 | error.rightmost_column = column + 1; |
739 | push_error(error); |
740 | has_error = true; |
741 | } else { |
742 | Token error = make_error("Cannot use a decimal point in a binary number." ); |
743 | error.start_column = column; |
744 | error.leftmost_column = column; |
745 | error.end_column = column + 1; |
746 | error.rightmost_column = column + 1; |
747 | push_error(error); |
748 | has_error = true; |
749 | } |
750 | if (!has_error) { |
751 | _advance(); |
752 | |
753 | // Consume decimal digits. |
754 | if (is_underscore(_peek())) { // Disallow `10._`, but allow `10.`. |
755 | Token error = make_error(R"(Unexpected underscore after decimal point.)" ); |
756 | error.start_column = column; |
757 | error.leftmost_column = column; |
758 | error.end_column = column + 1; |
759 | error.rightmost_column = column + 1; |
760 | push_error(error); |
761 | has_error = true; |
762 | } |
763 | previous_was_underscore = false; |
764 | while (is_digit(_peek()) || is_underscore(_peek())) { |
765 | if (is_underscore(_peek())) { |
766 | if (previous_was_underscore) { |
767 | Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)" ); |
768 | error.start_column = column; |
769 | error.leftmost_column = column; |
770 | error.end_column = column + 1; |
771 | error.rightmost_column = column + 1; |
772 | push_error(error); |
773 | } |
774 | previous_was_underscore = true; |
775 | } else { |
776 | previous_was_underscore = false; |
777 | } |
778 | _advance(); |
779 | } |
780 | } |
781 | } |
782 | if (base == 10) { |
783 | if (_peek() == 'e' || _peek() == 'E') { |
784 | has_exponent = true; |
785 | _advance(); |
786 | if (_peek() == '+' || _peek() == '-') { |
787 | // Exponent sign. |
788 | _advance(); |
789 | } |
790 | // Consume exponent digits. |
791 | if (!is_digit(_peek())) { |
792 | Token error = make_error(R"(Expected exponent value after "e".)" ); |
793 | error.start_column = column; |
794 | error.leftmost_column = column; |
795 | error.end_column = column + 1; |
796 | error.rightmost_column = column + 1; |
797 | push_error(error); |
798 | } |
799 | previous_was_underscore = false; |
800 | while (is_digit(_peek()) || is_underscore(_peek())) { |
801 | if (is_underscore(_peek())) { |
802 | if (previous_was_underscore) { |
803 | Token error = make_error(R"(Multiple underscores cannot be adjacent in a numeric literal.)" ); |
804 | error.start_column = column; |
805 | error.leftmost_column = column; |
806 | error.end_column = column + 1; |
807 | error.rightmost_column = column + 1; |
808 | push_error(error); |
809 | } |
810 | previous_was_underscore = true; |
811 | } else { |
812 | previous_was_underscore = false; |
813 | } |
814 | _advance(); |
815 | } |
816 | } |
817 | } |
818 | |
819 | // Detect extra decimal point. |
820 | if (!has_error && has_decimal && _peek() == '.' && _peek(1) != '.') { |
821 | Token error = make_error("Cannot use a decimal point twice in a number." ); |
822 | error.start_column = column; |
823 | error.leftmost_column = column; |
824 | error.end_column = column + 1; |
825 | error.rightmost_column = column + 1; |
826 | push_error(error); |
827 | has_error = true; |
828 | } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) { |
829 | // Letter at the end of the number. |
830 | push_error("Invalid numeric notation." ); |
831 | } |
832 | |
833 | // Create a string with the whole number. |
834 | int len = _current - _start; |
835 | String number = String(_start, len).replace("_" , "" ); |
836 | |
837 | // Convert to the appropriate literal type. |
838 | if (base == 16) { |
839 | int64_t value = number.hex_to_int(); |
840 | return make_literal(value); |
841 | } else if (base == 2) { |
842 | int64_t value = number.bin_to_int(); |
843 | return make_literal(value); |
844 | } else if (has_decimal || has_exponent) { |
845 | double value = number.to_float(); |
846 | return make_literal(value); |
847 | } else { |
848 | int64_t value = number.to_int(); |
849 | return make_literal(value); |
850 | } |
851 | } |
852 | |
853 | GDScriptTokenizer::Token GDScriptTokenizer::string() { |
854 | enum StringType { |
855 | STRING_REGULAR, |
856 | STRING_NAME, |
857 | STRING_NODEPATH, |
858 | }; |
859 | |
860 | bool is_multiline = false; |
861 | StringType type = STRING_REGULAR; |
862 | |
863 | if (_peek(-1) == '&') { |
864 | type = STRING_NAME; |
865 | _advance(); |
866 | } else if (_peek(-1) == '^') { |
867 | type = STRING_NODEPATH; |
868 | _advance(); |
869 | } |
870 | |
871 | char32_t quote_char = _peek(-1); |
872 | |
873 | if (_peek() == quote_char && _peek(1) == quote_char) { |
874 | is_multiline = true; |
875 | // Consume all quotes. |
876 | _advance(); |
877 | _advance(); |
878 | } |
879 | |
880 | String result; |
881 | char32_t prev = 0; |
882 | int prev_pos = 0; |
883 | |
884 | for (;;) { |
885 | // Consume actual string. |
886 | if (_is_at_end()) { |
887 | return make_error("Unterminated string." ); |
888 | } |
889 | |
890 | char32_t ch = _peek(); |
891 | |
892 | if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) { |
893 | Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion." ); |
894 | error.start_column = column; |
895 | error.leftmost_column = error.start_column; |
896 | error.end_column = column + 1; |
897 | error.rightmost_column = error.end_column; |
898 | push_error(error); |
899 | } |
900 | |
901 | if (ch == '\\') { |
902 | // Escape pattern. |
903 | _advance(); |
904 | if (_is_at_end()) { |
905 | return make_error("Unterminated string." ); |
906 | } |
907 | |
908 | // Grab escape character. |
909 | char32_t code = _peek(); |
910 | _advance(); |
911 | if (_is_at_end()) { |
912 | return make_error("Unterminated string." ); |
913 | } |
914 | |
915 | char32_t escaped = 0; |
916 | bool valid_escape = true; |
917 | |
918 | switch (code) { |
919 | case 'a': |
920 | escaped = '\a'; |
921 | break; |
922 | case 'b': |
923 | escaped = '\b'; |
924 | break; |
925 | case 'f': |
926 | escaped = '\f'; |
927 | break; |
928 | case 'n': |
929 | escaped = '\n'; |
930 | break; |
931 | case 'r': |
932 | escaped = '\r'; |
933 | break; |
934 | case 't': |
935 | escaped = '\t'; |
936 | break; |
937 | case 'v': |
938 | escaped = '\v'; |
939 | break; |
940 | case '\'': |
941 | escaped = '\''; |
942 | break; |
943 | case '\"': |
944 | escaped = '\"'; |
945 | break; |
946 | case '\\': |
947 | escaped = '\\'; |
948 | break; |
949 | case 'U': |
950 | case 'u': { |
951 | // Hexadecimal sequence. |
952 | int hex_len = (code == 'U') ? 6 : 4; |
953 | for (int j = 0; j < hex_len; j++) { |
954 | if (_is_at_end()) { |
955 | return make_error("Unterminated string." ); |
956 | } |
957 | |
958 | char32_t digit = _peek(); |
959 | char32_t value = 0; |
960 | if (is_digit(digit)) { |
961 | value = digit - '0'; |
962 | } else if (digit >= 'a' && digit <= 'f') { |
963 | value = digit - 'a'; |
964 | value += 10; |
965 | } else if (digit >= 'A' && digit <= 'F') { |
966 | value = digit - 'A'; |
967 | value += 10; |
968 | } else { |
969 | // Make error, but keep parsing the string. |
970 | Token error = make_error("Invalid hexadecimal digit in unicode escape sequence." ); |
971 | error.start_column = column; |
972 | error.leftmost_column = error.start_column; |
973 | error.end_column = column + 1; |
974 | error.rightmost_column = error.end_column; |
975 | push_error(error); |
976 | valid_escape = false; |
977 | break; |
978 | } |
979 | |
980 | escaped <<= 4; |
981 | escaped |= value; |
982 | |
983 | _advance(); |
984 | } |
985 | } break; |
986 | case '\r': |
987 | if (_peek() != '\n') { |
988 | // Carriage return without newline in string. (???) |
989 | // Just add it to the string and keep going. |
990 | result += ch; |
991 | _advance(); |
992 | break; |
993 | } |
994 | [[fallthrough]]; |
995 | case '\n': |
996 | // Escaping newline. |
997 | newline(false); |
998 | valid_escape = false; // Don't add to the string. |
999 | break; |
1000 | default: |
1001 | Token error = make_error("Invalid escape in string." ); |
1002 | error.start_column = column - 2; |
1003 | error.leftmost_column = error.start_column; |
1004 | push_error(error); |
1005 | valid_escape = false; |
1006 | break; |
1007 | } |
1008 | // Parse UTF-16 pair. |
1009 | if (valid_escape) { |
1010 | if ((escaped & 0xfffffc00) == 0xd800) { |
1011 | if (prev == 0) { |
1012 | prev = escaped; |
1013 | prev_pos = column - 2; |
1014 | continue; |
1015 | } else { |
1016 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate" ); |
1017 | error.start_column = column - 2; |
1018 | error.leftmost_column = error.start_column; |
1019 | push_error(error); |
1020 | valid_escape = false; |
1021 | prev = 0; |
1022 | } |
1023 | } else if ((escaped & 0xfffffc00) == 0xdc00) { |
1024 | if (prev == 0) { |
1025 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate" ); |
1026 | error.start_column = column - 2; |
1027 | error.leftmost_column = error.start_column; |
1028 | push_error(error); |
1029 | valid_escape = false; |
1030 | } else { |
1031 | escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000); |
1032 | prev = 0; |
1033 | } |
1034 | } |
1035 | if (prev != 0) { |
1036 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate" ); |
1037 | error.start_column = prev_pos; |
1038 | error.leftmost_column = error.start_column; |
1039 | push_error(error); |
1040 | prev = 0; |
1041 | } |
1042 | } |
1043 | |
1044 | if (valid_escape) { |
1045 | result += escaped; |
1046 | } |
1047 | } else if (ch == quote_char) { |
1048 | if (prev != 0) { |
1049 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate" ); |
1050 | error.start_column = prev_pos; |
1051 | error.leftmost_column = error.start_column; |
1052 | push_error(error); |
1053 | prev = 0; |
1054 | } |
1055 | _advance(); |
1056 | if (is_multiline) { |
1057 | if (_peek() == quote_char && _peek(1) == quote_char) { |
1058 | // Ended the multiline string. Consume all quotes. |
1059 | _advance(); |
1060 | _advance(); |
1061 | break; |
1062 | } else { |
1063 | // Not a multiline string termination, add consumed quote. |
1064 | result += quote_char; |
1065 | } |
1066 | } else { |
1067 | // Ended single-line string. |
1068 | break; |
1069 | } |
1070 | } else { |
1071 | if (prev != 0) { |
1072 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate" ); |
1073 | error.start_column = prev_pos; |
1074 | error.leftmost_column = error.start_column; |
1075 | push_error(error); |
1076 | prev = 0; |
1077 | } |
1078 | result += ch; |
1079 | _advance(); |
1080 | if (ch == '\n') { |
1081 | newline(false); |
1082 | } |
1083 | } |
1084 | } |
1085 | if (prev != 0) { |
1086 | Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate" ); |
1087 | error.start_column = prev_pos; |
1088 | error.leftmost_column = error.start_column; |
1089 | push_error(error); |
1090 | prev = 0; |
1091 | } |
1092 | |
1093 | // Make the literal. |
1094 | Variant string; |
1095 | switch (type) { |
1096 | case STRING_NAME: |
1097 | string = StringName(result); |
1098 | break; |
1099 | case STRING_NODEPATH: |
1100 | string = NodePath(result); |
1101 | break; |
1102 | case STRING_REGULAR: |
1103 | string = result; |
1104 | break; |
1105 | } |
1106 | |
1107 | return make_literal(string); |
1108 | } |
1109 | |
1110 | void GDScriptTokenizer::check_indent() { |
1111 | ERR_FAIL_COND_MSG(column != 1, "Checking tokenizer indentation in the middle of a line." ); |
1112 | |
1113 | if (_is_at_end()) { |
1114 | // Send dedents for every indent level. |
1115 | pending_indents -= indent_level(); |
1116 | indent_stack.clear(); |
1117 | return; |
1118 | } |
1119 | |
1120 | for (;;) { |
1121 | char32_t current_indent_char = _peek(); |
1122 | int indent_count = 0; |
1123 | |
1124 | if (current_indent_char != ' ' && current_indent_char != '\t' && current_indent_char != '\r' && current_indent_char != '\n' && current_indent_char != '#') { |
1125 | // First character of the line is not whitespace, so we clear all indentation levels. |
1126 | // Unless we are in a continuation or in multiline mode (inside expression). |
1127 | if (line_continuation || multiline_mode) { |
1128 | return; |
1129 | } |
1130 | pending_indents -= indent_level(); |
1131 | indent_stack.clear(); |
1132 | return; |
1133 | } |
1134 | |
1135 | if (_peek() == '\r') { |
1136 | _advance(); |
1137 | if (_peek() != '\n') { |
1138 | push_error("Stray carriage return character in source code." ); |
1139 | } |
1140 | } |
1141 | if (_peek() == '\n') { |
1142 | // Empty line, keep going. |
1143 | _advance(); |
1144 | newline(false); |
1145 | continue; |
1146 | } |
1147 | |
1148 | // Check indent level. |
1149 | bool mixed = false; |
1150 | while (!_is_at_end()) { |
1151 | char32_t space = _peek(); |
1152 | if (space == '\t') { |
1153 | // Consider individual tab columns. |
1154 | column += tab_size - 1; |
1155 | indent_count += tab_size; |
1156 | } else if (space == ' ') { |
1157 | indent_count += 1; |
1158 | } else { |
1159 | break; |
1160 | } |
1161 | mixed = mixed || space != current_indent_char; |
1162 | _advance(); |
1163 | } |
1164 | |
1165 | if (_is_at_end()) { |
1166 | // Reached the end with an empty line, so just dedent as much as needed. |
1167 | pending_indents -= indent_level(); |
1168 | indent_stack.clear(); |
1169 | return; |
1170 | } |
1171 | |
1172 | if (_peek() == '\r') { |
1173 | _advance(); |
1174 | if (_peek() != '\n') { |
1175 | push_error("Stray carriage return character in source code." ); |
1176 | } |
1177 | } |
1178 | if (_peek() == '\n') { |
1179 | // Empty line, keep going. |
1180 | _advance(); |
1181 | newline(false); |
1182 | continue; |
1183 | } |
1184 | if (_peek() == '#') { |
1185 | // Comment. Advance to the next line. |
1186 | #ifdef TOOLS_ENABLED |
1187 | String ; |
1188 | while (_peek() != '\n' && !_is_at_end()) { |
1189 | comment += _advance(); |
1190 | } |
1191 | comments[line] = CommentData(comment, true); |
1192 | #else |
1193 | while (_peek() != '\n' && !_is_at_end()) { |
1194 | _advance(); |
1195 | } |
1196 | #endif // TOOLS_ENABLED |
1197 | if (_is_at_end()) { |
1198 | // Reached the end with an empty line, so just dedent as much as needed. |
1199 | pending_indents -= indent_level(); |
1200 | indent_stack.clear(); |
1201 | return; |
1202 | } |
1203 | _advance(); // Consume '\n'. |
1204 | newline(false); |
1205 | continue; |
1206 | } |
1207 | |
1208 | if (mixed && !line_continuation && !multiline_mode) { |
1209 | Token error = make_error("Mixed use of tabs and spaces for indentation." ); |
1210 | error.start_line = line; |
1211 | error.start_column = 1; |
1212 | error.leftmost_column = 1; |
1213 | error.rightmost_column = column; |
1214 | push_error(error); |
1215 | } |
1216 | |
1217 | if (line_continuation || multiline_mode) { |
1218 | // We cleared up all the whitespace at the beginning of the line. |
1219 | // But if this is a continuation or multiline mode and we don't want any indentation change. |
1220 | return; |
1221 | } |
1222 | |
1223 | // Check if indentation character is consistent. |
1224 | if (indent_char == '\0') { |
1225 | // First time indenting, choose character now. |
1226 | indent_char = current_indent_char; |
1227 | } else if (current_indent_char != indent_char) { |
1228 | Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file." , |
1229 | _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char))); |
1230 | error.start_line = line; |
1231 | error.start_column = 1; |
1232 | error.leftmost_column = 1; |
1233 | error.rightmost_column = column; |
1234 | push_error(error); |
1235 | } |
1236 | |
1237 | // Now we can do actual indentation changes. |
1238 | |
1239 | // Check if indent or dedent. |
1240 | int previous_indent = 0; |
1241 | if (indent_level() > 0) { |
1242 | previous_indent = indent_stack.back()->get(); |
1243 | } |
1244 | if (indent_count == previous_indent) { |
1245 | // No change in indentation. |
1246 | return; |
1247 | } |
1248 | if (indent_count > previous_indent) { |
1249 | // Indentation increased. |
1250 | indent_stack.push_back(indent_count); |
1251 | pending_indents++; |
1252 | } else { |
1253 | // Indentation decreased (dedent). |
1254 | if (indent_level() == 0) { |
1255 | push_error("Tokenizer bug: trying to dedent without previous indent." ); |
1256 | return; |
1257 | } |
1258 | while (indent_level() > 0 && indent_stack.back()->get() > indent_count) { |
1259 | indent_stack.pop_back(); |
1260 | pending_indents--; |
1261 | } |
1262 | if ((indent_level() > 0 && indent_stack.back()->get() != indent_count) || (indent_level() == 0 && indent_count != 0)) { |
1263 | // Mismatched indentation alignment. |
1264 | Token error = make_error("Unindent doesn't match the previous indentation level." ); |
1265 | error.start_line = line; |
1266 | error.start_column = 1; |
1267 | error.leftmost_column = 1; |
1268 | error.end_column = column + 1; |
1269 | error.rightmost_column = column + 1; |
1270 | push_error(error); |
1271 | // Still, we'll be lenient and keep going, so keep this level in the stack. |
1272 | indent_stack.push_back(indent_count); |
1273 | } |
1274 | } |
1275 | break; // Get out of the loop in any case. |
1276 | } |
1277 | } |
1278 | |
1279 | String GDScriptTokenizer::_get_indent_char_name(char32_t ch) { |
1280 | ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape()); |
1281 | |
1282 | return ch == ' ' ? "space" : "tab" ; |
1283 | } |
1284 | |
1285 | void GDScriptTokenizer::_skip_whitespace() { |
1286 | if (pending_indents != 0) { |
1287 | // Still have some indent/dedent tokens to give. |
1288 | return; |
1289 | } |
1290 | |
1291 | bool is_bol = column == 1; // Beginning of line. |
1292 | |
1293 | if (is_bol) { |
1294 | check_indent(); |
1295 | return; |
1296 | } |
1297 | |
1298 | for (;;) { |
1299 | char32_t c = _peek(); |
1300 | switch (c) { |
1301 | case ' ': |
1302 | _advance(); |
1303 | break; |
1304 | case '\t': |
1305 | _advance(); |
1306 | // Consider individual tab columns. |
1307 | column += tab_size - 1; |
1308 | break; |
1309 | case '\r': |
1310 | _advance(); // Consume either way. |
1311 | if (_peek() != '\n') { |
1312 | push_error("Stray carriage return character in source code." ); |
1313 | return; |
1314 | } |
1315 | break; |
1316 | case '\n': |
1317 | _advance(); |
1318 | newline(!is_bol); // Don't create new line token if line is empty. |
1319 | check_indent(); |
1320 | break; |
1321 | case '#': { |
1322 | // Comment. |
1323 | #ifdef TOOLS_ENABLED |
1324 | String ; |
1325 | while (_peek() != '\n' && !_is_at_end()) { |
1326 | comment += _advance(); |
1327 | } |
1328 | comments[line] = CommentData(comment, is_bol); |
1329 | #else |
1330 | while (_peek() != '\n' && !_is_at_end()) { |
1331 | _advance(); |
1332 | } |
1333 | #endif // TOOLS_ENABLED |
1334 | if (_is_at_end()) { |
1335 | return; |
1336 | } |
1337 | _advance(); // Consume '\n' |
1338 | newline(!is_bol); |
1339 | check_indent(); |
1340 | } break; |
1341 | default: |
1342 | return; |
1343 | } |
1344 | } |
1345 | } |
1346 | |
1347 | GDScriptTokenizer::Token GDScriptTokenizer::scan() { |
1348 | if (has_error()) { |
1349 | return pop_error(); |
1350 | } |
1351 | |
1352 | _skip_whitespace(); |
1353 | |
1354 | if (pending_newline) { |
1355 | pending_newline = false; |
1356 | if (!multiline_mode) { |
1357 | // Don't return newline tokens on multiline mode. |
1358 | return last_newline; |
1359 | } |
1360 | } |
1361 | |
1362 | // Check for potential errors after skipping whitespace(). |
1363 | if (has_error()) { |
1364 | return pop_error(); |
1365 | } |
1366 | |
1367 | _start = _current; |
1368 | start_line = line; |
1369 | start_column = column; |
1370 | leftmost_column = column; |
1371 | rightmost_column = column; |
1372 | |
1373 | if (pending_indents != 0) { |
1374 | // Adjust position for indent. |
1375 | _start -= start_column - 1; |
1376 | start_column = 1; |
1377 | leftmost_column = 1; |
1378 | if (pending_indents > 0) { |
1379 | // Indents. |
1380 | pending_indents--; |
1381 | return make_token(Token::INDENT); |
1382 | } else { |
1383 | // Dedents. |
1384 | pending_indents++; |
1385 | Token dedent = make_token(Token::DEDENT); |
1386 | dedent.end_column += 1; |
1387 | dedent.rightmost_column += 1; |
1388 | return dedent; |
1389 | } |
1390 | } |
1391 | |
1392 | if (_is_at_end()) { |
1393 | return make_token(Token::TK_EOF); |
1394 | } |
1395 | |
1396 | const char32_t c = _advance(); |
1397 | |
1398 | if (c == '\\') { |
1399 | // Line continuation with backslash. |
1400 | if (_peek() == '\r') { |
1401 | if (_peek(1) != '\n') { |
1402 | return make_error("Unexpected carriage return character." ); |
1403 | } |
1404 | _advance(); |
1405 | } |
1406 | if (_peek() != '\n') { |
1407 | return make_error("Expected new line after \"\\\"." ); |
1408 | } |
1409 | _advance(); |
1410 | newline(false); |
1411 | line_continuation = true; |
1412 | return scan(); // Recurse to get next token. |
1413 | } |
1414 | |
1415 | line_continuation = false; |
1416 | |
1417 | if (is_digit(c)) { |
1418 | return number(); |
1419 | } else if (is_unicode_identifier_start(c)) { |
1420 | return potential_identifier(); |
1421 | } |
1422 | |
1423 | switch (c) { |
1424 | // String literals. |
1425 | case '"': |
1426 | case '\'': |
1427 | return string(); |
1428 | |
1429 | // Annotation. |
1430 | case '@': |
1431 | return annotation(); |
1432 | |
1433 | // Single characters. |
1434 | case '~': |
1435 | return make_token(Token::TILDE); |
1436 | case ',': |
1437 | return make_token(Token::COMMA); |
1438 | case ':': |
1439 | return make_token(Token::COLON); |
1440 | case ';': |
1441 | return make_token(Token::SEMICOLON); |
1442 | case '$': |
1443 | return make_token(Token::DOLLAR); |
1444 | case '?': |
1445 | return make_token(Token::QUESTION_MARK); |
1446 | case '`': |
1447 | return make_token(Token::BACKTICK); |
1448 | |
1449 | // Parens. |
1450 | case '(': |
1451 | push_paren('('); |
1452 | return make_token(Token::PARENTHESIS_OPEN); |
1453 | case '[': |
1454 | push_paren('['); |
1455 | return make_token(Token::BRACKET_OPEN); |
1456 | case '{': |
1457 | push_paren('{'); |
1458 | return make_token(Token::BRACE_OPEN); |
1459 | case ')': |
1460 | if (!pop_paren('(')) { |
1461 | return make_paren_error(c); |
1462 | } |
1463 | return make_token(Token::PARENTHESIS_CLOSE); |
1464 | case ']': |
1465 | if (!pop_paren('[')) { |
1466 | return make_paren_error(c); |
1467 | } |
1468 | return make_token(Token::BRACKET_CLOSE); |
1469 | case '}': |
1470 | if (!pop_paren('{')) { |
1471 | return make_paren_error(c); |
1472 | } |
1473 | return make_token(Token::BRACE_CLOSE); |
1474 | |
1475 | // Double characters. |
1476 | case '!': |
1477 | if (_peek() == '=') { |
1478 | _advance(); |
1479 | return make_token(Token::BANG_EQUAL); |
1480 | } else { |
1481 | return make_token(Token::BANG); |
1482 | } |
1483 | case '.': |
1484 | if (_peek() == '.') { |
1485 | _advance(); |
1486 | return make_token(Token::PERIOD_PERIOD); |
1487 | } else if (is_digit(_peek())) { |
1488 | // Number starting with '.'. |
1489 | return number(); |
1490 | } else { |
1491 | return make_token(Token::PERIOD); |
1492 | } |
1493 | case '+': |
1494 | if (_peek() == '=') { |
1495 | _advance(); |
1496 | return make_token(Token::PLUS_EQUAL); |
1497 | } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) { |
1498 | // Number starting with '+'. |
1499 | return number(); |
1500 | } else { |
1501 | return make_token(Token::PLUS); |
1502 | } |
1503 | case '-': |
1504 | if (_peek() == '=') { |
1505 | _advance(); |
1506 | return make_token(Token::MINUS_EQUAL); |
1507 | } else if (is_digit(_peek()) && !last_token.can_precede_bin_op()) { |
1508 | // Number starting with '-'. |
1509 | return number(); |
1510 | } else if (_peek() == '>') { |
1511 | _advance(); |
1512 | return make_token(Token::FORWARD_ARROW); |
1513 | } else { |
1514 | return make_token(Token::MINUS); |
1515 | } |
1516 | case '*': |
1517 | if (_peek() == '=') { |
1518 | _advance(); |
1519 | return make_token(Token::STAR_EQUAL); |
1520 | } else if (_peek() == '*') { |
1521 | if (_peek(1) == '=') { |
1522 | _advance(); |
1523 | _advance(); // Advance both '*' and '=' |
1524 | return make_token(Token::STAR_STAR_EQUAL); |
1525 | } |
1526 | _advance(); |
1527 | return make_token(Token::STAR_STAR); |
1528 | } else { |
1529 | return make_token(Token::STAR); |
1530 | } |
1531 | case '/': |
1532 | if (_peek() == '=') { |
1533 | _advance(); |
1534 | return make_token(Token::SLASH_EQUAL); |
1535 | } else { |
1536 | return make_token(Token::SLASH); |
1537 | } |
1538 | case '%': |
1539 | if (_peek() == '=') { |
1540 | _advance(); |
1541 | return make_token(Token::PERCENT_EQUAL); |
1542 | } else { |
1543 | return make_token(Token::PERCENT); |
1544 | } |
1545 | case '^': |
1546 | if (_peek() == '=') { |
1547 | _advance(); |
1548 | return make_token(Token::CARET_EQUAL); |
1549 | } else if (_peek() == '"' || _peek() == '\'') { |
1550 | // Node path |
1551 | return string(); |
1552 | } else { |
1553 | return make_token(Token::CARET); |
1554 | } |
1555 | case '&': |
1556 | if (_peek() == '&') { |
1557 | _advance(); |
1558 | return make_token(Token::AMPERSAND_AMPERSAND); |
1559 | } else if (_peek() == '=') { |
1560 | _advance(); |
1561 | return make_token(Token::AMPERSAND_EQUAL); |
1562 | } else if (_peek() == '"' || _peek() == '\'') { |
1563 | // String Name |
1564 | return string(); |
1565 | } else { |
1566 | return make_token(Token::AMPERSAND); |
1567 | } |
1568 | case '|': |
1569 | if (_peek() == '|') { |
1570 | _advance(); |
1571 | return make_token(Token::PIPE_PIPE); |
1572 | } else if (_peek() == '=') { |
1573 | _advance(); |
1574 | return make_token(Token::PIPE_EQUAL); |
1575 | } else { |
1576 | return make_token(Token::PIPE); |
1577 | } |
1578 | |
1579 | // Potential VCS conflict markers. |
1580 | case '=': |
1581 | if (_peek() == '=') { |
1582 | return check_vcs_marker('=', Token::EQUAL_EQUAL); |
1583 | } else { |
1584 | return make_token(Token::EQUAL); |
1585 | } |
1586 | case '<': |
1587 | if (_peek() == '=') { |
1588 | _advance(); |
1589 | return make_token(Token::LESS_EQUAL); |
1590 | } else if (_peek() == '<') { |
1591 | if (_peek(1) == '=') { |
1592 | _advance(); |
1593 | _advance(); // Advance both '<' and '=' |
1594 | return make_token(Token::LESS_LESS_EQUAL); |
1595 | } else { |
1596 | return check_vcs_marker('<', Token::LESS_LESS); |
1597 | } |
1598 | } else { |
1599 | return make_token(Token::LESS); |
1600 | } |
1601 | case '>': |
1602 | if (_peek() == '=') { |
1603 | _advance(); |
1604 | return make_token(Token::GREATER_EQUAL); |
1605 | } else if (_peek() == '>') { |
1606 | if (_peek(1) == '=') { |
1607 | _advance(); |
1608 | _advance(); // Advance both '>' and '=' |
1609 | return make_token(Token::GREATER_GREATER_EQUAL); |
1610 | } else { |
1611 | return check_vcs_marker('>', Token::GREATER_GREATER); |
1612 | } |
1613 | } else { |
1614 | return make_token(Token::GREATER); |
1615 | } |
1616 | |
1617 | default: |
1618 | if (is_whitespace(c)) { |
1619 | return make_error(vformat(R"(Invalid white space character U+%04X.)" , static_cast<int32_t>(c))); |
1620 | } else { |
1621 | return make_error(vformat(R"(Invalid character "%c" (U+%04X).)" , c, static_cast<int32_t>(c))); |
1622 | } |
1623 | } |
1624 | } |
1625 | |
1626 | GDScriptTokenizer::GDScriptTokenizer() { |
1627 | #ifdef TOOLS_ENABLED |
1628 | if (EditorSettings::get_singleton()) { |
1629 | tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size" ); |
1630 | } |
1631 | #endif // TOOLS_ENABLED |
1632 | #ifdef DEBUG_ENABLED |
1633 | make_keyword_list(); |
1634 | #endif // DEBUG_ENABLED |
1635 | } |
1636 | |