1/*
2** Lexical analyzer.
3** Copyright (C) 2005-2014 Mike Pall. See Copyright Notice in luajit.h
4**
5** Major portions taken verbatim or adapted from the Lua interpreter.
6** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
7*/
8
9#define lj_lex_c
10#define LUA_CORE
11
12#include "lj_obj.h"
13#include "lj_gc.h"
14#include "lj_err.h"
15#include "lj_str.h"
16#if LJ_HASFFI
17#include "lj_tab.h"
18#include "lj_ctype.h"
19#include "lj_cdata.h"
20#include "lualib.h"
21#endif
22#include "lj_state.h"
23#include "lj_lex.h"
24#include "lj_parse.h"
25#include "lj_char.h"
26#include "lj_strscan.h"
27
28/* Lua lexer token names. */
29static const char *const tokennames[] = {
30#define TKSTR1(name) #name,
31#define TKSTR2(name, sym) #sym,
32TKDEF(TKSTR1, TKSTR2)
33#undef TKSTR1
34#undef TKSTR2
35 NULL
36};
37
38/* -- Buffer handling ----------------------------------------------------- */
39
/* Convert a (possibly signed) char to a non-negative int. */
#define char2int(c)             ((int)(uint8_t)(c))
/* Fetch the next input char into ls->current, refilling the buffer if empty. */
#define next(ls) \
  ((ls)->current = ((ls)->n--) > 0 ? char2int(*(ls)->p++) : fillbuf(ls))
/* Append the current char to the token buffer, then advance. */
#define save_and_next(ls)       (save((ls), (ls)->current), next(ls))
/* True if the current char is one of the two line-break characters. */
#define currIsNewline(ls) \
  ((ls)->current == '\n' || (ls)->current == '\r')
/* Value of ls->current once the reader has run out of input. */
#define END_OF_STREAM           (-1)
46
47static int fillbuf(LexState *ls)
48{
49 size_t sz;
50 const char *buf = ls->rfunc(ls->L, ls->rdata, &sz);
51 if (buf == NULL || sz == 0) return END_OF_STREAM;
52 ls->n = (MSize)sz - 1;
53 ls->p = buf;
54 return char2int(*(ls->p++));
55}
56
57static LJ_NOINLINE void save_grow(LexState *ls, int c)
58{
59 MSize newsize;
60 if (ls->sb.sz >= LJ_MAX_STR/2)
61 lj_lex_error(ls, 0, LJ_ERR_XELEM);
62 newsize = ls->sb.sz * 2;
63 lj_str_resizebuf(ls->L, &ls->sb, newsize);
64 ls->sb.buf[ls->sb.n++] = (char)c;
65}
66
67static LJ_AINLINE void save(LexState *ls, int c)
68{
69 if (LJ_UNLIKELY(ls->sb.n + 1 > ls->sb.sz))
70 save_grow(ls, c);
71 else
72 ls->sb.buf[ls->sb.n++] = (char)c;
73}
74
75static void inclinenumber(LexState *ls)
76{
77 int old = ls->current;
78 lua_assert(currIsNewline(ls));
79 next(ls); /* skip `\n' or `\r' */
80 if (currIsNewline(ls) && ls->current != old)
81 next(ls); /* skip `\n\r' or `\r\n' */
82 if (++ls->linenumber >= LJ_MAX_LINE)
83 lj_lex_error(ls, ls->token, LJ_ERR_XLINES);
84}
85
86/* -- Scanner for terminals ----------------------------------------------- */
87
88/* Parse a number literal. */
89static void lex_number(LexState *ls, TValue *tv)
90{
91 StrScanFmt fmt;
92 int c, xp = 'e';
93 lua_assert(lj_char_isdigit(ls->current));
94 if ((c = ls->current) == '0') {
95 save_and_next(ls);
96 if ((ls->current | 0x20) == 'x') xp = 'p';
97 }
98 while (lj_char_isident(ls->current) || ls->current == '.' ||
99 ((ls->current == '-' || ls->current == '+') && (c | 0x20) == xp)) {
100 c = ls->current;
101 save_and_next(ls);
102 }
103 save(ls, '\0');
104 fmt = lj_strscan_scan((const uint8_t *)ls->sb.buf, tv,
105 (LJ_DUALNUM ? STRSCAN_OPT_TOINT : STRSCAN_OPT_TONUM) |
106 (LJ_HASFFI ? (STRSCAN_OPT_LL|STRSCAN_OPT_IMAG) : 0));
107 if (LJ_DUALNUM && fmt == STRSCAN_INT) {
108 setitype(tv, LJ_TISNUM);
109 } else if (fmt == STRSCAN_NUM) {
110 /* Already in correct format. */
111#if LJ_HASFFI
112 } else if (fmt != STRSCAN_ERROR) {
113 lua_State *L = ls->L;
114 GCcdata *cd;
115 lua_assert(fmt == STRSCAN_I64 || fmt == STRSCAN_U64 || fmt == STRSCAN_IMAG);
116 if (!ctype_ctsG(G(L))) {
117 ptrdiff_t oldtop = savestack(L, L->top);
118 luaopen_ffi(L); /* Load FFI library on-demand. */
119 L->top = restorestack(L, oldtop);
120 }
121 if (fmt == STRSCAN_IMAG) {
122 cd = lj_cdata_new_(L, CTID_COMPLEX_DOUBLE, 2*sizeof(double));
123 ((double *)cdataptr(cd))[0] = 0;
124 ((double *)cdataptr(cd))[1] = numV(tv);
125 } else {
126 cd = lj_cdata_new_(L, fmt==STRSCAN_I64 ? CTID_INT64 : CTID_UINT64, 8);
127 *(uint64_t *)cdataptr(cd) = tv->u64;
128 }
129 lj_parse_keepcdata(ls, tv, cd);
130#endif
131 } else {
132 lua_assert(fmt == STRSCAN_ERROR);
133 lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
134 }
135}
136
137static int skip_sep(LexState *ls)
138{
139 int count = 0;
140 int s = ls->current;
141 lua_assert(s == '[' || s == ']');
142 save_and_next(ls);
143 while (ls->current == '=') {
144 save_and_next(ls);
145 count++;
146 }
147 return (ls->current == s) ? count : (-count) - 1;
148}
149
150static void read_long_string(LexState *ls, TValue *tv, int sep)
151{
152 save_and_next(ls); /* skip 2nd `[' */
153 if (currIsNewline(ls)) /* string starts with a newline? */
154 inclinenumber(ls); /* skip it */
155 for (;;) {
156 switch (ls->current) {
157 case END_OF_STREAM:
158 lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
159 break;
160 case ']':
161 if (skip_sep(ls) == sep) {
162 save_and_next(ls); /* skip 2nd `]' */
163 goto endloop;
164 }
165 break;
166 case '\n':
167 case '\r':
168 save(ls, '\n');
169 inclinenumber(ls);
170 if (!tv) lj_str_resetbuf(&ls->sb); /* avoid wasting space */
171 break;
172 default:
173 if (tv) save_and_next(ls);
174 else next(ls);
175 break;
176 }
177 } endloop:
178 if (tv) {
179 GCstr *str = lj_parse_keepstr(ls, ls->sb.buf + (2 + (MSize)sep),
180 ls->sb.n - 2*(2 + (MSize)sep));
181 setstrV(ls->L, tv, str);
182 }
183}
184
185static void read_string(LexState *ls, int delim, TValue *tv)
186{
187 save_and_next(ls);
188 while (ls->current != delim) {
189 switch (ls->current) {
190 case END_OF_STREAM:
191 lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
192 continue;
193 case '\n':
194 case '\r':
195 lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
196 continue;
197 case '\\': {
198 int c = next(ls); /* Skip the '\\'. */
199 switch (c) {
200 case 'a': c = '\a'; break;
201 case 'b': c = '\b'; break;
202 case 'f': c = '\f'; break;
203 case 'n': c = '\n'; break;
204 case 'r': c = '\r'; break;
205 case 't': c = '\t'; break;
206 case 'v': c = '\v'; break;
207 case 'x': /* Hexadecimal escape '\xXX'. */
208 c = (next(ls) & 15u) << 4;
209 if (!lj_char_isdigit(ls->current)) {
210 if (!lj_char_isxdigit(ls->current)) goto err_xesc;
211 c += 9 << 4;
212 }
213 c += (next(ls) & 15u);
214 if (!lj_char_isdigit(ls->current)) {
215 if (!lj_char_isxdigit(ls->current)) goto err_xesc;
216 c += 9;
217 }
218 break;
219 case 'z': /* Skip whitespace. */
220 next(ls);
221 while (lj_char_isspace(ls->current))
222 if (currIsNewline(ls)) inclinenumber(ls); else next(ls);
223 continue;
224 case '\n': case '\r': save(ls, '\n'); inclinenumber(ls); continue;
225 case '\\': case '\"': case '\'': break;
226 case END_OF_STREAM: continue;
227 default:
228 if (!lj_char_isdigit(c))
229 goto err_xesc;
230 c -= '0'; /* Decimal escape '\ddd'. */
231 if (lj_char_isdigit(next(ls))) {
232 c = c*10 + (ls->current - '0');
233 if (lj_char_isdigit(next(ls))) {
234 c = c*10 + (ls->current - '0');
235 if (c > 255) {
236 err_xesc:
237 lj_lex_error(ls, TK_string, LJ_ERR_XESC);
238 }
239 next(ls);
240 }
241 }
242 save(ls, c);
243 continue;
244 }
245 save(ls, c);
246 next(ls);
247 continue;
248 }
249 default:
250 save_and_next(ls);
251 break;
252 }
253 }
254 save_and_next(ls); /* skip delimiter */
255 setstrV(ls->L, tv, lj_parse_keepstr(ls, ls->sb.buf + 1, ls->sb.n - 2));
256}
257
258/* -- Main lexical scanner ------------------------------------------------ */
259
260static int llex(LexState *ls, TValue *tv)
261{
262 lj_str_resetbuf(&ls->sb);
263 for (;;) {
264 if (lj_char_isident(ls->current)) {
265 GCstr *s;
266 if (lj_char_isdigit(ls->current)) { /* Numeric literal. */
267 lex_number(ls, tv);
268 return TK_number;
269 }
270 /* Identifier or reserved word. */
271 do {
272 save_and_next(ls);
273 } while (lj_char_isident(ls->current));
274 s = lj_parse_keepstr(ls, ls->sb.buf, ls->sb.n);
275 setstrV(ls->L, tv, s);
276 if (s->reserved > 0) /* Reserved word? */
277 return TK_OFS + s->reserved;
278 return TK_name;
279 }
280 switch (ls->current) {
281 case '\n':
282 case '\r':
283 inclinenumber(ls);
284 continue;
285 case ' ':
286 case '\t':
287 case '\v':
288 case '\f':
289 next(ls);
290 continue;
291 case '-':
292 next(ls);
293 if (ls->current != '-') return '-';
294 /* else is a comment */
295 next(ls);
296 if (ls->current == '[') {
297 int sep = skip_sep(ls);
298 lj_str_resetbuf(&ls->sb); /* `skip_sep' may dirty the buffer */
299 if (sep >= 0) {
300 read_long_string(ls, NULL, sep); /* long comment */
301 lj_str_resetbuf(&ls->sb);
302 continue;
303 }
304 }
305 /* else short comment */
306 while (!currIsNewline(ls) && ls->current != END_OF_STREAM)
307 next(ls);
308 continue;
309 case '[': {
310 int sep = skip_sep(ls);
311 if (sep >= 0) {
312 read_long_string(ls, tv, sep);
313 return TK_string;
314 } else if (sep == -1) {
315 return '[';
316 } else {
317 lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
318 continue;
319 }
320 }
321 case '=':
322 next(ls);
323 if (ls->current != '=') return '='; else { next(ls); return TK_eq; }
324 case '<':
325 next(ls);
326 if (ls->current != '=') return '<'; else { next(ls); return TK_le; }
327 case '>':
328 next(ls);
329 if (ls->current != '=') return '>'; else { next(ls); return TK_ge; }
330 case '~':
331 next(ls);
332 if (ls->current != '=') return '~'; else { next(ls); return TK_ne; }
333 case ':':
334 next(ls);
335 if (ls->current != ':') return ':'; else { next(ls); return TK_label; }
336 case '"':
337 case '\'':
338 read_string(ls, ls->current, tv);
339 return TK_string;
340 case '.':
341 save_and_next(ls);
342 if (ls->current == '.') {
343 next(ls);
344 if (ls->current == '.') {
345 next(ls);
346 return TK_dots; /* ... */
347 }
348 return TK_concat; /* .. */
349 } else if (!lj_char_isdigit(ls->current)) {
350 return '.';
351 } else {
352 lex_number(ls, tv);
353 return TK_number;
354 }
355 case END_OF_STREAM:
356 return TK_eof;
357 default: {
358 int c = ls->current;
359 next(ls);
360 return c; /* Single-char tokens (+ - / ...). */
361 }
362 }
363 }
364}
365
366/* -- Lexer API ----------------------------------------------------------- */
367
368/* Setup lexer state. */
369int lj_lex_setup(lua_State *L, LexState *ls)
370{
371 int header = 0;
372 ls->L = L;
373 ls->fs = NULL;
374 ls->n = 0;
375 ls->p = NULL;
376 ls->vstack = NULL;
377 ls->sizevstack = 0;
378 ls->vtop = 0;
379 ls->bcstack = NULL;
380 ls->sizebcstack = 0;
381 ls->lookahead = TK_eof; /* No look-ahead token. */
382 ls->linenumber = 1;
383 ls->lastline = 1;
384 lj_str_resizebuf(ls->L, &ls->sb, LJ_MIN_SBUF);
385 next(ls); /* Read-ahead first char. */
386 if (ls->current == 0xef && ls->n >= 2 && char2int(ls->p[0]) == 0xbb &&
387 char2int(ls->p[1]) == 0xbf) { /* Skip UTF-8 BOM (if buffered). */
388 ls->n -= 2;
389 ls->p += 2;
390 next(ls);
391 header = 1;
392 }
393 if (ls->current == '#') { /* Skip POSIX #! header line. */
394 do {
395 next(ls);
396 if (ls->current == END_OF_STREAM) return 0;
397 } while (!currIsNewline(ls));
398 inclinenumber(ls);
399 header = 1;
400 }
401 if (ls->current == LUA_SIGNATURE[0]) { /* Bytecode dump. */
402 if (header) {
403 /*
404 ** Loading bytecode with an extra header is disabled for security
405 ** reasons. This may circumvent the usual check for bytecode vs.
406 ** Lua code by looking at the first char. Since this is a potential
407 ** security violation no attempt is made to echo the chunkname either.
408 */
409 setstrV(L, L->top++, lj_err_str(L, LJ_ERR_BCBAD));
410 lj_err_throw(L, LUA_ERRSYNTAX);
411 }
412 return 1;
413 }
414 return 0;
415}
416
417/* Cleanup lexer state. */
418void lj_lex_cleanup(lua_State *L, LexState *ls)
419{
420 global_State *g = G(L);
421 lj_mem_freevec(g, ls->bcstack, ls->sizebcstack, BCInsLine);
422 lj_mem_freevec(g, ls->vstack, ls->sizevstack, VarInfo);
423 lj_str_freebuf(g, &ls->sb);
424}
425
426void lj_lex_next(LexState *ls)
427{
428 ls->lastline = ls->linenumber;
429 if (LJ_LIKELY(ls->lookahead == TK_eof)) { /* No lookahead token? */
430 ls->token = llex(ls, &ls->tokenval); /* Get next token. */
431 } else { /* Otherwise return lookahead token. */
432 ls->token = ls->lookahead;
433 ls->lookahead = TK_eof;
434 ls->tokenval = ls->lookaheadval;
435 }
436}
437
438LexToken lj_lex_lookahead(LexState *ls)
439{
440 lua_assert(ls->lookahead == TK_eof);
441 ls->lookahead = llex(ls, &ls->lookaheadval);
442 return ls->lookahead;
443}
444
445const char *lj_lex_token2str(LexState *ls, LexToken token)
446{
447 if (token > TK_OFS)
448 return tokennames[token-TK_OFS-1];
449 else if (!lj_char_iscntrl(token))
450 return lj_str_pushf(ls->L, "%c", token);
451 else
452 return lj_str_pushf(ls->L, "char(%d)", token);
453}
454
455void lj_lex_error(LexState *ls, LexToken token, ErrMsg em, ...)
456{
457 const char *tok;
458 va_list argp;
459 if (token == 0) {
460 tok = NULL;
461 } else if (token == TK_name || token == TK_string || token == TK_number) {
462 save(ls, '\0');
463 tok = ls->sb.buf;
464 } else {
465 tok = lj_lex_token2str(ls, token);
466 }
467 va_start(argp, em);
468 lj_err_lex(ls->L, ls->chunkname, tok, ls->linenumber, em, argp);
469 va_end(argp);
470}
471
472void lj_lex_init(lua_State *L)
473{
474 uint32_t i;
475 for (i = 0; i < TK_RESERVED; i++) {
476 GCstr *s = lj_str_newz(L, tokennames[i]);
477 fixstring(s); /* Reserved words are never collected. */
478 s->reserved = (uint8_t)(i+1);
479 }
480}
481
482