lexer.c source code [MicroPython/py/lexer.c]

1	/*
2	* This file is part of the MicroPython project, http://micropython.org/
3	*
4	* The MIT License (MIT)
5	*
6	* Copyright (c) 2013, 2014 Damien P. George
7	*
8	* Permission is hereby granted, free of charge, to any person obtaining a copy
9	* of this software and associated documentation files (the "Software"), to deal
10	* in the Software without restriction, including without limitation the rights
11	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12	* copies of the Software, and to permit persons to whom the Software is
13	* furnished to do so, subject to the following conditions:
14	*
15	* The above copyright notice and this permission notice shall be included in
16	* all copies or substantial portions of the Software.
17	*
18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24	* THE SOFTWARE.
25	*/
26
27	#include <stdio.h>
28	#include <string.h>
29	#include <assert.h>
30
31	#include "py/reader.h"
32	#include "py/lexer.h"
33	#include "py/runtime.h"
34
35	#if MICROPY_ENABLE_COMPILER
36
37	#define TAB_SIZE (8)
38
// TODO: CPython appears to allow a NUL byte in the input stream. It is not
// clear whether that is intentional; this lexer does not allow it.
41
42	#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
43	#define CUR_CHAR(lex) ((lex)->chr0)
44
45	STATIC bool is_end(mp_lexer_t *lex) {
46	return lex->chr0 == MP_LEXER_EOF;
47	}
48
49	STATIC bool is_physical_newline(mp_lexer_t *lex) {
50	return lex->chr0 == `'\n'`;
51	}
52
53	STATIC bool is_char(mp_lexer_t *lex, byte c) {
54	return lex->chr0 == c;
55	}
56
57	STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
58	return lex->chr0 == c1 \|\| lex->chr0 == c2;
59	}
60
61	STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
62	return lex->chr0 == c1 \|\| lex->chr0 == c2 \|\| lex->chr0 == c3;
63	}
64
65	STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
66	return lex->chr1 == c;
67	}
68
69	STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
70	return lex->chr1 == c1 \|\| lex->chr1 == c2;
71	}
72
73	STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
74	return lex->chr2 == c1 \|\| lex->chr2 == c2;
75	}
76
77	STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
78	return lex->chr0 == c1 && lex->chr1 == c2;
79	}
80
81	STATIC bool is_whitespace(mp_lexer_t *lex) {
82	return unichar_isspace(lex->chr0);
83	}
84
85	STATIC bool is_letter(mp_lexer_t *lex) {
86	return unichar_isalpha(lex->chr0);
87	}
88
89	STATIC bool is_digit(mp_lexer_t *lex) {
90	return unichar_isdigit(lex->chr0);
91	}
92
93	STATIC bool is_following_digit(mp_lexer_t *lex) {
94	return unichar_isdigit(lex->chr1);
95	}
96
97	STATIC bool is_following_base_char(mp_lexer_t *lex) {
98	const unichar chr1 = lex->chr1 \| `0x20`;
99	return chr1 == `'b'` \|\| chr1 == `'o'` \|\| chr1 == `'x'`;
100	}
101
102	STATIC bool is_following_odigit(mp_lexer_t *lex) {
103	return lex->chr1 >= `'0'` && lex->chr1 <= `'7'`;
104	}
105
106	STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107	return is_char_or(lex, `'\''`, `'\"'`)
108	\|\| (is_char_or3(lex, `'r'`, `'u'`, `'b'`) && is_char_following_or(lex, `'\''`, `'\"'`))
109	\|\| ((is_char_and(lex, `'r'`, `'b'`) \|\| is_char_and(lex, `'b'`, `'r'`))
110	&& is_char_following_following_or(lex, `'\''`, `'\"'`));
111	}
112
113	// to easily parse utf-8 identifiers we allow any raw byte with high bit set
114	STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
115	return is_letter(lex) \|\| lex->chr0 == `'_'` \|\| lex->chr0 >= `0x80`;
116	}
117
118	STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
119	return is_head_of_identifier(lex) \|\| is_digit(lex);
120	}
121
122	STATIC void next_char(mp_lexer_t *lex) {
123	if (lex->chr0 == `'\n'`) {
124	// a new line
125	++lex->line;
126	lex->column = `1`;
127	} else if (lex->chr0 == `'\t'`) {
128	// a tab
129	lex->column = (((lex->column - `1` + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + `1`;
130	} else {
131	// a character worth one column
132	++lex->column;
133	}
134
135	lex->chr0 = lex->chr1;
136	lex->chr1 = lex->chr2;
137	lex->chr2 = lex->reader.readbyte(lex->reader.data);
138
139	if (lex->chr1 == `'\r'`) {
140	// CR is a new line, converted to LF
141	lex->chr1 = `'\n'`;
142	if (lex->chr2 == `'\n'`) {
143	// CR LF is a single new line, throw out the extra LF
144	lex->chr2 = lex->reader.readbyte(lex->reader.data);
145	}
146	}
147
148	// check if we need to insert a newline at end of file
149	if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != `'\n'`) {
150	lex->chr2 = `'\n'`;
151	}
152	}
153
154	STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
155	if (lex->num_indent_level >= lex->alloc_indent_level) {
156	lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
157	lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
158	}
159	lex->indent_level[lex->num_indent_level++] = indent;
160	}
161
162	STATIC size_t indent_top(mp_lexer_t *lex) {
163	return lex->indent_level[lex->num_indent_level - `1`];
164	}
165
166	STATIC void indent_pop(mp_lexer_t *lex) {
167	lex->num_indent_level -= `1`;
168	}
169
170	// some tricky operator encoding:
171	// <op> = begin with <op>, if this opchar matches then begin here
172	// e<op> = end with <op>, if this opchar matches then end
173	// c<op> = continue with <op>, if this opchar matches then continue matching
174	// this means if the start of two ops are the same then they are equal til the last char
175
176	STATIC const char *const tok_enc =
177	"()[]{},;~" // singles
178	":e=" // : :=
179	"<e=c<e=" // < <= << <<=
180	">e=c>e=" // > >= >> >>=
181	"e=ce=" // = * *=
182	"+e=" // + +=
183	"-e=e>" // - -= ->
184	"&e=" // & &=
185	"\|e=" // \| \|=
186	"/e=c/e=" // / /= // //=
187	"%e=" // % %=
188	"^e=" // ^ ^=
189	"@e=" // @ @=
190	"=e=" // = ==
191	"!."; // start of special cases: != . ...
192
193	// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
194	STATIC const uint8_t tok_enc_kind[] = {
195	MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
196	MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
197	MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
198	MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_OP_TILDE,
199
200	MP_TOKEN_DEL_COLON, MP_TOKEN_OP_ASSIGN,
201	MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
202	MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
203	MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
204	MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
205	MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
206	MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
207	MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
208	MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
209	MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
210	MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
211	MP_TOKEN_OP_AT, MP_TOKEN_DEL_AT_EQUAL,
212	MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
213	};
214
215	// must have the same order as enum in lexer.h
216	// must be sorted according to strcmp
217	STATIC const char *const tok_kw[] = {
218	"False",
219	"None",
220	"True",
221	"__debug__",
222	"and",
223	"as",
224	"assert",
225	#if MICROPY_PY_ASYNC_AWAIT
226	"async",
227	"await",
228	#endif
229	"break",
230	"class",
231	"continue",
232	"def",
233	"del",
234	"elif",
235	"else",
236	"except",
237	"finally",
238	"for",
239	"from",
240	"global",
241	"if",
242	"import",
243	"in",
244	"is",
245	"lambda",
246	"nonlocal",
247	"not",
248	"or",
249	"pass",
250	"raise",
251	"return",
252	"try",
253	"while",
254	"with",
255	"yield",
256	};
257
258	// This is called with CUR_CHAR() before first hex digit, and should return with
259	// it pointing to last hex digit
260	// num_digits must be greater than zero
261	STATIC bool get_hex(mp_lexer_t lex, size_t num_digits, mp_uint_t result) {
262	mp_uint_t num = `0`;
263	while (num_digits-- != `0`) {
264	next_char(lex);
265	unichar c = CUR_CHAR(lex);
266	if (!unichar_isxdigit(c)) {
267	return false;
268	}
269	num = (num << `4`) + unichar_xdigit_value(c);
270	}
271	*result = num;
272	return true;
273	}
274
275	STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
276	// get first quoting character
277	char quote_char = `'\''`;
278	if (is_char(lex, `'\"'`)) {
279	quote_char = `'\"'`;
280	}
281	next_char(lex);
282
283	// work out if it's a single or triple quoted literal
284	size_t num_quotes;
285	if (is_char_and(lex, quote_char, quote_char)) {
286	// triple quotes
287	next_char(lex);
288	next_char(lex);
289	num_quotes = `3`;
290	} else {
291	// single quotes
292	num_quotes = `1`;
293	}
294
295	size_t n_closing = `0`;
296	while (!is_end(lex) && (num_quotes > `1` \|\| !is_char(lex, `'\n'`)) && n_closing < num_quotes) {
297	if (is_char(lex, quote_char)) {
298	n_closing += `1`;
299	vstr_add_char(&lex->vstr, CUR_CHAR(lex));
300	} else {
301	n_closing = `0`;
302	if (is_char(lex, `'\\'`)) {
303	next_char(lex);
304	unichar c = CUR_CHAR(lex);
305	if (is_raw) {
306	// raw strings allow escaping of quotes, but the backslash is also emitted
307	vstr_add_char(&lex->vstr, `'\\'`);
308	} else {
309	switch (c) {
310	// note: "c" can never be MP_LEXER_EOF because next_char
311	// always inserts a newline at the end of the input stream
312	case `'\n'`:
313	c = MP_LEXER_EOF;
314	break; // backslash escape the newline, just ignore it
315	case `'\\'`:
316	break;
317	case `'\''`:
318	break;
319	case `'"'`:
320	break;
321	case `'a'`:
322	c = `0x07`;
323	break;
324	case `'b'`:
325	c = `0x08`;
326	break;
327	case `'t'`:
328	c = `0x09`;
329	break;
330	case `'n'`:
331	c = `0x0a`;
332	break;
333	case `'v'`:
334	c = `0x0b`;
335	break;
336	case `'f'`:
337	c = `0x0c`;
338	break;
339	case `'r'`:
340	c = `0x0d`;
341	break;
342	case `'u'`:
343	case `'U'`:
344	if (lex->tok_kind == MP_TOKEN_BYTES) {
345	// b'\u1234' == b'\\u1234'
346	vstr_add_char(&lex->vstr, `'\\'`);
347	break;
348	}
349	// Otherwise fall through.
350	MP_FALLTHROUGH
351	case `'x'`: {
352	mp_uint_t num = `0`;
353	if (!get_hex(lex, (c == `'x'` ? `2` : c == `'u'` ? `4` : `8`), &num)) {
354	// not enough hex chars for escape sequence
355	lex->tok_kind = MP_TOKEN_INVALID;
356	}
357	c = num;
358	break;
359	}
360	case `'N'`:
361	// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
362	// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
363	// 3MB of text; even gzip-compressed and with minimal structure, it'll take
364	// roughly half a meg of storage. This form of Unicode escape may be added
365	// later on, but it's definitely not a priority right now. -- CJA 20140607
366	mp_raise_NotImplementedError(MP_ERROR_TEXT("unicode name escapes"));
367	break;
368	default:
369	if (c >= `'0'` && c <= `'7'`) {
370	// Octal sequence, 1-3 chars
371	size_t digits = `3`;
372	mp_uint_t num = c - `'0'`;
373	while (is_following_odigit(lex) && --digits != `0`) {
374	next_char(lex);
375	num = num * `8` + (CUR_CHAR(lex) - `'0'`);
376	}
377	c = num;
378	} else {
379	// unrecognised escape character; CPython lets this through verbatim as '\' and then the character
380	vstr_add_char(&lex->vstr, `'\\'`);
381	}
382	break;
383	}
384	}
385	if (c != MP_LEXER_EOF) {
386	if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
387	if (c < `0x110000` && lex->tok_kind == MP_TOKEN_STRING) {
388	vstr_add_char(&lex->vstr, c);
389	} else if (c < `0x100` && lex->tok_kind == MP_TOKEN_BYTES) {
390	vstr_add_byte(&lex->vstr, c);
391	} else {
392	// unicode character out of range
393	// this raises a generic SyntaxError; could provide more info
394	lex->tok_kind = MP_TOKEN_INVALID;
395	}
396	} else {
397	// without unicode everything is just added as an 8-bit byte
398	if (c < `0x100`) {
399	vstr_add_byte(&lex->vstr, c);
400	} else {
401	// 8-bit character out of range
402	// this raises a generic SyntaxError; could provide more info
403	lex->tok_kind = MP_TOKEN_INVALID;
404	}
405	}
406	}
407	} else {
408	// Add the "character" as a byte so that we remain 8-bit clean.
409	// This way, strings are parsed correctly whether or not they contain utf-8 chars.
410	vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
411	}
412	}
413	next_char(lex);
414	}
415
416	// check we got the required end quotes
417	if (n_closing < num_quotes) {
418	lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
419	}
420
421	// cut off the end quotes from the token text
422	vstr_cut_tail_bytes(&lex->vstr, n_closing);
423	}
424
425	STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
426	bool had_physical_newline = false;
427	while (!is_end(lex)) {
428	if (is_physical_newline(lex)) {
429	if (stop_at_newline && lex->nested_bracket_level == `0`) {
430	break;
431	}
432	had_physical_newline = true;
433	next_char(lex);
434	} else if (is_whitespace(lex)) {
435	next_char(lex);
436	} else if (is_char(lex, `'#'`)) {
437	next_char(lex);
438	while (!is_end(lex) && !is_physical_newline(lex)) {
439	next_char(lex);
440	}
441	// had_physical_newline will be set on next loop
442	} else if (is_char_and(lex, `'\\'`, `'\n'`)) {
443	// line-continuation, so don't set had_physical_newline
444	next_char(lex);
445	next_char(lex);
446	} else {
447	break;
448	}
449	}
450	return had_physical_newline;
451	}
452
453	void mp_lexer_to_next(mp_lexer_t *lex) {
454	// start new token text
455	vstr_reset(&lex->vstr);
456
457	// skip white space and comments
458	bool had_physical_newline = skip_whitespace(lex, false);
459
460	// set token source information
461	lex->tok_line = lex->line;
462	lex->tok_column = lex->column;
463
464	if (lex->emit_dent < `0`) {
465	lex->tok_kind = MP_TOKEN_DEDENT;
466	lex->emit_dent += `1`;
467
468	} else if (lex->emit_dent > `0`) {
469	lex->tok_kind = MP_TOKEN_INDENT;
470	lex->emit_dent -= `1`;
471
472	} else if (had_physical_newline && lex->nested_bracket_level == `0`) {
473	lex->tok_kind = MP_TOKEN_NEWLINE;
474
475	size_t num_spaces = lex->column - `1`;
476	if (num_spaces == indent_top(lex)) {
477	} else if (num_spaces > indent_top(lex)) {
478	indent_push(lex, num_spaces);
479	lex->emit_dent += `1`;
480	} else {
481	while (num_spaces < indent_top(lex)) {
482	indent_pop(lex);
483	lex->emit_dent -= `1`;
484	}
485	if (num_spaces != indent_top(lex)) {
486	lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
487	}
488	}
489
490	} else if (is_end(lex)) {
491	lex->tok_kind = MP_TOKEN_END;
492
493	} else if (is_string_or_bytes(lex)) {
494	// a string or bytes literal
495
496	// Python requires adjacent string/bytes literals to be automatically
497	// concatenated. We do it here in the tokeniser to make efficient use of RAM,
498	// because then the lexer's vstr can be used to accumulate the string literal,
499	// in contrast to creating a parse tree of strings and then joining them later
500	// in the compiler. It's also more compact in code size to do it here.
501
502	// MP_TOKEN_END is used to indicate that this is the first string token
503	lex->tok_kind = MP_TOKEN_END;
504
505	// Loop to accumulate string/bytes literals
506	do {
507	// parse type codes
508	bool is_raw = false;
509	mp_token_kind_t kind = MP_TOKEN_STRING;
510	int n_char = `0`;
511	if (is_char(lex, `'u'`)) {
512	n_char = `1`;
513	} else if (is_char(lex, `'b'`)) {
514	kind = MP_TOKEN_BYTES;
515	n_char = `1`;
516	if (is_char_following(lex, `'r'`)) {
517	is_raw = true;
518	n_char = `2`;
519	}
520	} else if (is_char(lex, `'r'`)) {
521	is_raw = true;
522	n_char = `1`;
523	if (is_char_following(lex, `'b'`)) {
524	kind = MP_TOKEN_BYTES;
525	n_char = `2`;
526	}
527	}
528
529	// Set or check token kind
530	if (lex->tok_kind == MP_TOKEN_END) {
531	lex->tok_kind = kind;
532	} else if (lex->tok_kind != kind) {
533	// Can't concatenate string with bytes
534	break;
535	}
536
537	// Skip any type code characters
538	if (n_char != `0`) {
539	next_char(lex);
540	if (n_char == `2`) {
541	next_char(lex);
542	}
543	}
544
545	// Parse the literal
546	parse_string_literal(lex, is_raw);
547
548	// Skip whitespace so we can check if there's another string following
549	skip_whitespace(lex, true);
550
551	} while (is_string_or_bytes(lex));
552
553	} else if (is_head_of_identifier(lex)) {
554	lex->tok_kind = MP_TOKEN_NAME;
555
556	// get first char (add as byte to remain 8-bit clean and support utf-8)
557	vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
558	next_char(lex);
559
560	// get tail chars
561	while (!is_end(lex) && is_tail_of_identifier(lex)) {
562	vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
563	next_char(lex);
564	}
565
566	// Check if the name is a keyword.
567	// We also check for __debug__ here and convert it to its value. This is
568	// so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
569	// need to check for this special token in many places in the compiler.
570	const char *s = vstr_null_terminated_str(&lex->vstr);
571	for (size_t i = `0`; i < MP_ARRAY_SIZE(tok_kw); i++) {
572	int cmp = strcmp(s, tok_kw[i]);
573	if (cmp == `0`) {
574	lex->tok_kind = MP_TOKEN_KW_FALSE + i;
575	if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
576	lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == `0` ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
577	}
578	break;
579	} else if (cmp < `0`) {
580	// Table is sorted and comparison was less-than, so stop searching
581	break;
582	}
583	}
584
585	} else if (is_digit(lex) \|\| (is_char(lex, `'.'`) && is_following_digit(lex))) {
586	bool forced_integer = false;
587	if (is_char(lex, `'.'`)) {
588	lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
589	} else {
590	lex->tok_kind = MP_TOKEN_INTEGER;
591	if (is_char(lex, `'0'`) && is_following_base_char(lex)) {
592	forced_integer = true;
593	}
594	}
595
596	// get first char
597	vstr_add_char(&lex->vstr, CUR_CHAR(lex));
598	next_char(lex);
599
600	// get tail chars
601	while (!is_end(lex)) {
602	if (!forced_integer && is_char_or(lex, `'e'`, `'E'`)) {
603	lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
604	vstr_add_char(&lex->vstr, `'e'`);
605	next_char(lex);
606	if (is_char(lex, `'+'`) \|\| is_char(lex, `'-'`)) {
607	vstr_add_char(&lex->vstr, CUR_CHAR(lex));
608	next_char(lex);
609	}
610	} else if (is_letter(lex) \|\| is_digit(lex) \|\| is_char(lex, `'.'`)) {
611	if (is_char_or3(lex, `'.'`, `'j'`, `'J'`)) {
612	lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
613	}
614	vstr_add_char(&lex->vstr, CUR_CHAR(lex));
615	next_char(lex);
616	} else if (is_char(lex, `'_'`)) {
617	next_char(lex);
618	} else {
619	break;
620	}
621	}
622
623	} else {
624	// search for encoded delimiter or operator
625
626	const char *t = tok_enc;
627	size_t tok_enc_index = `0`;
628	for (; t != `0` && !is_char(lex, t); t += `1`) {
629	if (t == `'e'` \|\| t == `'c'`) {
630	t += `1`;
631	}
632	tok_enc_index += `1`;
633	}
634
635	next_char(lex);
636
637	if (*t == `0`) {
638	// didn't match any delimiter or operator characters
639	lex->tok_kind = MP_TOKEN_INVALID;
640
641	} else if (*t == `'!'`) {
642	// "!=" is a special case because "!" is not a valid operator
643	if (is_char(lex, `'='`)) {
644	next_char(lex);
645	lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
646	} else {
647	lex->tok_kind = MP_TOKEN_INVALID;
648	}
649
650	} else if (*t == `'.'`) {
651	// "." and "..." are special cases because ".." is not a valid operator
652	if (is_char_and(lex, `'.'`, `'.'`)) {
653	next_char(lex);
654	next_char(lex);
655	lex->tok_kind = MP_TOKEN_ELLIPSIS;
656	} else {
657	lex->tok_kind = MP_TOKEN_DEL_PERIOD;
658	}
659
660	} else {
661	// matched a delimiter or operator character
662
663	// get the maximum characters for a valid token
664	t += `1`;
665	size_t t_index = tok_enc_index;
666	while (t == `'c'` \|\| t == `'e'`) {
667	t_index += `1`;
668	if (is_char(lex, t[`1`])) {
669	next_char(lex);
670	tok_enc_index = t_index;
671	if (*t == `'e'`) {
672	break;
673	}
674	} else if (*t == `'c'`) {
675	break;
676	}
677	t += `2`;
678	}
679
680	// set token kind
681	lex->tok_kind = tok_enc_kind[tok_enc_index];
682
683	// compute bracket level for implicit line joining
684	if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN \|\| lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN \|\| lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
685	lex->nested_bracket_level += `1`;
686	} else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE \|\| lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE \|\| lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
687	lex->nested_bracket_level -= `1`;
688	}
689	}
690	}
691	}
692
693	mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
694	mp_lexer_t *lex = m_new_obj(mp_lexer_t);
695
696	lex->source_name = src_name;
697	lex->reader = reader;
698	lex->line = `1`;
699	lex->column = (size_t)-`2`; // account for 3 dummy bytes
700	lex->emit_dent = `0`;
701	lex->nested_bracket_level = `0`;
702	lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
703	lex->num_indent_level = `1`;
704	lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
705	vstr_init(&lex->vstr, `32`);
706
707	// store sentinel for first indentation level
708	lex->indent_level[`0`] = `0`;
709
710	// load lexer with start of file, advancing lex->column to 1
711	// start with dummy bytes and use next_char() for proper EOL/EOF handling
712	lex->chr0 = lex->chr1 = lex->chr2 = `0`;
713	next_char(lex);
714	next_char(lex);
715	next_char(lex);
716
717	// preload first token
718	mp_lexer_to_next(lex);
719
720	// Check that the first token is in the first column. If it's not then we
721	// convert the token kind to INDENT so that the parser gives a syntax error.
722	if (lex->tok_column != `1`) {
723	lex->tok_kind = MP_TOKEN_INDENT;
724	}
725
726	return lex;
727	}
728
729	mp_lexer_t mp_lexer_new_from_str_len(qstr src_name, const* char *str, size_t len, size_t free_len) {
730	mp_reader_t reader;
731	mp_reader_new_mem(&reader, (const byte *)str, len, free_len);
732	return mp_lexer_new(src_name, reader);
733	}
734
735	#if MICROPY_READER_POSIX \|\| MICROPY_READER_VFS
736
737	mp_lexer_t mp_lexer_new_from_file(const* char *filename) {
738	mp_reader_t reader;
739	mp_reader_new_file(&reader, filename);
740	return mp_lexer_new(qstr_from_str(filename), reader);
741	}
742
743	#if MICROPY_HELPER_LEXER_UNIX
744
745	mp_lexer_t mp_lexer_new_from_fd(qstr filename, int* fd, bool close_fd) {
746	mp_reader_t reader;
747	mp_reader_new_file_from_fd(&reader, fd, close_fd);
748	return mp_lexer_new(filename, reader);
749	}
750
751	#endif
752
753	#endif
754
755	void mp_lexer_free(mp_lexer_t *lex) {
756	if (lex) {
757	lex->reader.close(lex->reader.data);
758	vstr_clear(&lex->vstr);
759	m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
760	m_del_obj(mp_lexer_t, lex);
761	}
762	}
763
764	#if 0
765	// This function is used to print the current token and should only be
766	// needed to debug the lexer, so it's not available via a config option.
767	void mp_lexer_show_token(const mp_lexer_t *lex) {
768	printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
769	if (lex->vstr.len > `0`) {
770	const byte i = (const* byte *)lex->vstr.buf;
771	const byte j = (const* byte *)i + lex->vstr.len;
772	printf(" ");
773	while (i < j) {
774	unichar c = utf8_get_char(i);
775	i = utf8_next_char(i);
776	if (unichar_isprint(c)) {
777	printf("%c", (int)c);
778	} else {
779	printf("?");
780	}
781	}
782	}
783	printf("\n");
784	}
785	#endif
786
787	#endif // MICROPY_ENABLE_COMPILER
788

Browse the source code of MicroPython/py/lexer.c