1 | /* |
2 | * Integration of https://github.com/starwing/luautf8 |
3 | * |
4 | * MIT License |
5 | * |
6 | * Copyright (c) 2018 Xavier Wang |
7 | * |
8 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | * of this software and associated documentation files (the "Software"), to deal |
10 | * in the Software without restriction, including without limitation the rights |
11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | * copies of the Software, and to permit persons to whom the Software is |
13 | * furnished to do so, subject to the following conditions: |
14 | * |
15 | * The above copyright notice and this permission notice shall be included in all |
16 | * copies or substantial portions of the Software. |
17 | * |
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
24 | * SOFTWARE. |
25 | */ |
26 | |
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>


#include <assert.h>
#include <limits.h>
#include <string.h>

#include "../unidata.h"
36 | |
37 | /* UTF-8 string operations */ |
38 | |
/* Size of the scratch buffer handed to utf8_encode, which fills it from the
 * last byte backwards. */
#define UTF8_BUFFSZ 8
/* Largest value utf8_encode asserts on and utf8_decode accepts (31 bits). */
#define UTF8_MAX 0x7FFFFFFFu
/* Largest value accepted as a code point by the strict checks below. */
#define UTF8_MAXCP 0x10FFFFu
/* Nonzero when the byte at p has the form 10xxxxxx (a continuation byte). */
#define iscont(p) ((*(p) & 0xC0) == 0x80)
/* Explicit cast helper: CAST(type, expression). */
#define CAST(tp,expr) ((tp)(expr))

/* Quotes a string literal for error messages; only defined here when the
 * included headers have not already defined it. */
#ifndef LUA_QL
# define LUA_QL(x) "'" x "'"
#endif
48 | |
49 | static int utf8_invalid (utfint ch) |
50 | { return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); } |
51 | |
52 | static size_t utf8_encode (char *buff, utfint x) { |
53 | int n = 1; /* number of bytes put in buffer (backwards) */ |
54 | lua_assert(x <= UTF8_MAX); |
55 | if (x < 0x80) /* ascii? */ |
56 | buff[UTF8_BUFFSZ - 1] = x & 0x7F; |
57 | else { /* need continuation bytes */ |
58 | utfint mfb = 0x3f; /* maximum that fits in first byte */ |
59 | do { /* add continuation bytes */ |
60 | buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f); |
61 | x >>= 6; /* remove added bits */ |
62 | mfb >>= 1; /* now there is one less bit available in first byte */ |
63 | } while (x > mfb); /* still needs continuation byte? */ |
64 | buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */ |
65 | } |
66 | return n; |
67 | } |
68 | |
69 | static const char *utf8_decode (const char *s, utfint *val, int strict) { |
70 | static const utfint limits[] = |
71 | {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u}; |
72 | unsigned int c = (unsigned char)s[0]; |
73 | utfint res = 0; /* final result */ |
74 | if (c < 0x80) /* ascii? */ |
75 | res = c; |
76 | else { |
77 | int count = 0; /* to count number of continuation bytes */ |
78 | for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ |
79 | unsigned int cc = (unsigned char)s[++count]; /* read next byte */ |
80 | if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ |
81 | return NULL; /* invalid byte sequence */ |
82 | res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ |
83 | } |
84 | res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ |
85 | if (count > 5 || res > UTF8_MAX || res < limits[count]) |
86 | return NULL; /* invalid byte sequence */ |
87 | s += count; /* skip continuation bytes read */ |
88 | } |
89 | if (strict) { |
90 | /* check for invalid code points; too large or surrogates */ |
91 | if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu)) |
92 | return NULL; |
93 | } |
94 | if (val) *val = res; |
95 | return s + 1; /* +1 to include first byte */ |
96 | } |
97 | |
/* Steps backwards from e over any continuation bytes, without passing s,
 * and returns the byte just before them; returns s when nothing is left. */
static const char *utf8_prev (const char *s, const char *e) {
  const char *q = e;
  while (q > s && iscont(q - 1))
    q--;
  if (q > s)
    return q - 1;
  return s;
}
102 | |
/* Advances from s past the continuation bytes that follow it and returns
 * the next position; returns e when s is not before e.  Note that the byte
 * at q + 1 is inspected while q < e, so the byte at e itself may be read. */
static const char *utf8_next (const char *s, const char *e) {
  const char *q = s;
  for (; q < e; q++) {
    if (!iscont(q + 1))
      break;
  }
  return q < e ? q + 1 : e;
}
107 | |
/* Counts how many utf8_next steps it takes to walk from s to e. */
static size_t utf8_length (const char *s, const char *e) {
  size_t count = 0;
  while (s < e) {
    s = utf8_next(s, e);
    count++;
  }
  return count;
}
114 | |
115 | static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) { |
116 | const char *p = s + offset - 1; |
117 | if (idx >= 0) { |
118 | while (p < e && idx > 0) |
119 | p = utf8_next(p, e), --idx; |
120 | return idx == 0 ? p : NULL; |
121 | } else { |
122 | while (s < p && idx < 0) |
123 | p = utf8_prev(s, p), ++idx; |
124 | return idx == 0 ? p : NULL; |
125 | } |
126 | } |
127 | |
/* Resolves a 1-based character index: non-negative values count from the
 * start of the string, negative values count from the end.
 * Returns whatever utf8_offset yields (NULL when out of range). */
static const char *utf8_relat (const char *s, const char *e, int idx) {
  if (idx < 0)
    return utf8_offset(s, e, e-s+1, idx);
  return utf8_offset(s, e, 1, idx - 1);
}
133 | |
134 | static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) { |
135 | const char *ps = utf8_relat(s, e, CAST(int, *i)); |
136 | const char *pe = utf8_relat(s, e, CAST(int, *j)); |
137 | *i = (ps ? ps : (*i > 0 ? e : s)) - s; |
138 | *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s; |
139 | return *i < *j; |
140 | } |
141 | |
142 | |
143 | /* Unicode character categories */ |
144 | |
/* Number of elements of a true array (not a pointer). */
#define table_size(t) (sizeof(t)/sizeof((t)[0]))

/* X-macro list of character categories.  Each entry is
 * X(class letter, name); 'name' is pasted into utf8_is<name> and
 * <name>_table by the users of this list. */
#define utf8_categories(X) \
  X('a', alpha) \
  X('c', cntrl) \
  X('d', digit) \
  X('l', lower) \
  X('p', punct) \
  X('s', space) \
  X('t', compose) \
  X('u', upper) \
  X('x', xdigit)

/* X-macro list of case converters.  Each entry is X(name); 'name' is
 * pasted into utf8_to<name>, to<name>_table and Lutf8_<name>. */
#define utf8_converters(X) \
  X(lower) \
  X(upper) \
  X(title) \
  X(fold)
163 | |
164 | static int find_in_range (range_table *t, size_t size, utfint ch) { |
165 | size_t begin, end; |
166 | |
167 | begin = 0; |
168 | end = size; |
169 | |
170 | while (begin < end) { |
171 | size_t mid = (begin + end) / 2; |
172 | if (t[mid].last < ch) |
173 | begin = mid + 1; |
174 | else if (t[mid].first > ch) |
175 | end = mid; |
176 | else |
177 | return (ch - t[mid].first) % t[mid].step == 0; |
178 | } |
179 | |
180 | return 0; |
181 | } |
182 | |
183 | static int convert_char (conv_table *t, size_t size, utfint ch) { |
184 | size_t begin, end; |
185 | |
186 | begin = 0; |
187 | end = size; |
188 | |
189 | while (begin < end) { |
190 | size_t mid = (begin + end) / 2; |
191 | if (t[mid].last < ch) |
192 | begin = mid + 1; |
193 | else if (t[mid].first > ch) |
194 | end = mid; |
195 | else if ((ch - t[mid].first) % t[mid].step == 0) |
196 | return ch + t[mid].offset; |
197 | else |
198 | return ch; |
199 | } |
200 | |
201 | return ch; |
202 | } |
203 | |
/* Generates, for every entry of utf8_categories, a predicate
 * utf8_is<name>(ch) that looks ch up in <name>_table. */
#define define_category(cls, name) static int utf8_is##name (utfint ch)\
{ return find_in_range(name##_table, table_size(name##_table), ch); }
/* Generates, for every entry of utf8_converters, a function
 * utf8_to<name>(ch) that maps ch through to<name>_table. */
#define define_converter(name) static utfint utf8_to##name (utfint ch) \
{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
utf8_categories(define_category)
utf8_converters(define_converter)
#undef define_category
#undef define_converter
212 | |
213 | static int utf8_isgraph (utfint ch) { |
214 | if (find_in_range(space_table, table_size(space_table), ch)) |
215 | return 0; |
216 | if (find_in_range(graph_table, table_size(graph_table), ch)) |
217 | return 1; |
218 | if (find_in_range(compose_table, table_size(compose_table), ch)) |
219 | return 1; |
220 | return 0; |
221 | } |
222 | |
223 | static int utf8_isalnum (utfint ch) { |
224 | if (find_in_range(alpha_table, table_size(alpha_table), ch)) |
225 | return 1; |
226 | if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch)) |
227 | return 1; |
228 | return 0; |
229 | } |
230 | |
231 | static int utf8_width (utfint ch, int ambi_is_single) { |
232 | if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch)) |
233 | return 2; |
234 | if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch)) |
235 | return ambi_is_single ? 1 : 2; |
236 | if (find_in_range(compose_table, table_size(compose_table), ch)) |
237 | return 0; |
238 | if (find_in_range(unprintable_table, table_size(unprintable_table), ch)) |
239 | return 0; |
240 | return 1; |
241 | } |
242 | |
243 | |
244 | /* string module compatible interface */ |
245 | |
246 | static int typeerror (lua_State *L, int idx, const char *tname) |
247 | { return luaL_error(L, "%s expected, got %s" , tname, luaL_typename(L, idx)); } |
248 | |
249 | static const char *check_utf8 (lua_State *L, int idx, const char **end) { |
250 | size_t len; |
251 | const char *s = luaL_checklstring(L, idx, &len); |
252 | if (end) *end = s+len; |
253 | return s; |
254 | } |
255 | |
256 | static const char *to_utf8 (lua_State *L, int idx, const char **end) { |
257 | size_t len; |
258 | const char *s = lua_tolstring(L, idx, &len); |
259 | if (end) *end = s+len; |
260 | return s; |
261 | } |
262 | |
263 | static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) { |
264 | p = utf8_decode(p, pval, 0); |
265 | if (p == NULL) luaL_error(L, "invalid UTF-8 code" ); |
266 | return p; |
267 | } |
268 | |
269 | static void add_utf8char (luaL_Buffer *b, utfint ch) { |
270 | char buff[UTF8_BUFFSZ]; |
271 | size_t n = utf8_encode(buff, ch); |
272 | luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n); |
273 | } |
274 | |
275 | static lua_Integer byte_relat (lua_Integer pos, size_t len) { |
276 | if (pos >= 0) return pos; |
277 | else if (0u - (size_t)pos > len) return 0; |
278 | else return (lua_Integer)len + pos + 1; |
279 | } |
280 | |
281 | static int Lutf8_len (lua_State *L) { |
282 | size_t len, n; |
283 | const char *s = luaL_checklstring(L, 1, &len), *p, *e; |
284 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); |
285 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len); |
286 | int lax = lua_toboolean(L, 4); |
287 | luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, |
288 | "initial position out of string" ); |
289 | luaL_argcheck(L, --pose < (lua_Integer)len, 3, |
290 | "final position out of string" ); |
291 | for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) { |
292 | if (lax) |
293 | p = utf8_next(p, e); |
294 | else { |
295 | utfint ch; |
296 | const char *np = utf8_decode(p, &ch, !lax); |
297 | if (np == NULL || utf8_invalid(ch)) { |
298 | lua_pushnil(L); |
299 | lua_pushinteger(L, p - s + 1); |
300 | return 2; |
301 | } |
302 | p = np; |
303 | } |
304 | } |
305 | lua_pushinteger(L, n); |
306 | return 1; |
307 | } |
308 | |
309 | static int Lutf8_sub (lua_State *L) { |
310 | const char *e, *s = check_utf8(L, 1, &e); |
311 | lua_Integer posi = luaL_checkinteger(L, 2); |
312 | lua_Integer pose = luaL_optinteger(L, 3, -1); |
313 | if (utf8_range(s, e, &posi, &pose)) |
314 | lua_pushlstring(L, s+posi, pose-posi); |
315 | else |
316 | lua_pushliteral(L, "" ); |
317 | return 1; |
318 | } |
319 | |
320 | static int Lutf8_reverse (lua_State *L) { |
321 | luaL_Buffer b; |
322 | const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e); |
323 | (void) ends; |
324 | int lax = lua_toboolean(L, 2); |
325 | luaL_buffinit(L, &b); |
326 | if (lax) { |
327 | for (prev = e; s < prev; e = prev) { |
328 | prev = utf8_prev(s, prev); |
329 | luaL_addlstring(&b, prev, e-prev); |
330 | } |
331 | } else { |
332 | for (prev = e; s < prev; prev = pprev) { |
333 | utfint code = 0; |
334 | ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code); |
335 | assert(ends == prev); |
336 | if (utf8_invalid(code)) |
337 | return luaL_error(L, "invalid UTF-8 code" ); |
338 | if (!utf8_iscompose(code)) { |
339 | luaL_addlstring(&b, pprev, e-pprev); |
340 | e = pprev; |
341 | } |
342 | } |
343 | } |
344 | luaL_pushresult(&b); |
345 | return 1; |
346 | } |
347 | |
348 | static int Lutf8_byte (lua_State *L) { |
349 | size_t n = 0; |
350 | const char *e, *s = check_utf8(L, 1, &e); |
351 | lua_Integer posi = luaL_optinteger(L, 2, 1); |
352 | lua_Integer pose = luaL_optinteger(L, 3, posi); |
353 | if (utf8_range(s, e, &posi, &pose)) { |
354 | for (e = s + pose, s = s + posi; s < e; ++n) { |
355 | utfint ch = 0; |
356 | s = utf8_safe_decode(L, s, &ch); |
357 | lua_pushinteger(L, ch); |
358 | } |
359 | } |
360 | return CAST(int, n); |
361 | } |
362 | |
363 | static int Lutf8_codepoint (lua_State *L) { |
364 | const char *e, *s = check_utf8(L, 1, &e); |
365 | size_t len = e-s; |
366 | lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); |
367 | lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len); |
368 | int lax = lua_toboolean(L, 4); |
369 | int n; |
370 | const char *se; |
371 | luaL_argcheck(L, posi >= 1, 2, "out of range" ); |
372 | luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range" ); |
373 | if (posi > pose) return 0; /* empty interval; return no values */ |
374 | if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ |
375 | return luaL_error(L, "string slice too long" ); |
376 | n = (int)(pose - posi + 1); |
377 | luaL_checkstack(L, n, "string slice too long" ); |
378 | n = 0; /* count the number of returns */ |
379 | se = s + pose; /* string end */ |
380 | for (s += posi - 1; s < se;) { |
381 | utfint code = 0; |
382 | s = utf8_safe_decode(L, s, &code); |
383 | if (!lax && utf8_invalid(code)) |
384 | return luaL_error(L, "invalid UTF-8 code" ); |
385 | lua_pushinteger(L, code); |
386 | n++; |
387 | } |
388 | return n; |
389 | } |
390 | |
391 | static int Lutf8_char (lua_State *L) { |
392 | int i, n = lua_gettop(L); /* number of arguments */ |
393 | luaL_Buffer b; |
394 | luaL_buffinit(L, &b); |
395 | for (i = 1; i <= n; ++i) { |
396 | lua_Integer code = luaL_checkinteger(L, i); |
397 | luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range" ); |
398 | add_utf8char(&b, CAST(utfint, code)); |
399 | } |
400 | luaL_pushresult(&b); |
401 | return 1; |
402 | } |
403 | |
/* Generates one Lua C function Lutf8_<name> per entry of utf8_converters.
 * Each generated function inspects argument 1:
 *   - number: pushes utf8_to<name> applied to it;
 *   - string: decodes it character by character, converts every character
 *     with utf8_to<name> and pushes the re-encoded string (malformed input
 *     raises an error through utf8_safe_decode);
 *   - anything else: raises a type error. */
#define bind_converter(name) \
static int Lutf8_##name (lua_State *L) { \
  int t = lua_type(L, 1); \
  if (t == LUA_TNUMBER) \
    lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
  else if (t == LUA_TSTRING) { \
    luaL_Buffer b; \
    const char *e, *s = to_utf8(L, 1, &e); \
    luaL_buffinit(L, &b); \
    while (s < e) { \
      utfint ch = 0; \
      s = utf8_safe_decode(L, s, &ch); \
      add_utf8char(&b, utf8_to##name(ch)); \
    } \
    luaL_pushresult(&b); \
  } \
  else return typeerror(L, 1, "number/string"); \
  return 1; \
}
utf8_converters(bind_converter)
#undef bind_converter
425 | |
426 | |
427 | /* unicode extra interface */ |
428 | |
429 | static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) { |
430 | utfint code = 0; |
431 | int in_bracket = 0; |
432 | if (*s == '{') ++s, in_bracket = 1; |
433 | for (; s < e; ++s) { |
434 | utfint ch = (unsigned char)*s; |
435 | if (ch >= '0' && ch <= '9') ch = ch - '0'; |
436 | else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A'); |
437 | else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a'); |
438 | else if (!in_bracket) break; |
439 | else if (ch == '}') { ++s; break; } |
440 | else luaL_error(L, "invalid escape '%c'" , ch); |
441 | code *= hex ? 16 : 10; |
442 | code += ch; |
443 | } |
444 | *pch = code; |
445 | return s; |
446 | } |
447 | |
448 | static int Lutf8_escape (lua_State *L) { |
449 | const char *e, *s = check_utf8(L, 1, &e); |
450 | luaL_Buffer b; |
451 | luaL_buffinit(L, &b); |
452 | while (s < e) { |
453 | utfint ch = 0; |
454 | s = utf8_safe_decode(L, s, &ch); |
455 | if (ch == '%') { |
456 | int hex = 0; |
457 | switch (*s) { |
458 | case '0': case '1': case '2': case '3': |
459 | case '4': case '5': case '6': case '7': |
460 | case '8': case '9': case '{': |
461 | break; |
462 | case 'x': case 'X': hex = 1; /* fall through */ |
463 | case 'u': case 'U': if (s+1 < e) { ++s; break; } |
464 | /* fall through */ |
465 | default: |
466 | s = utf8_safe_decode(L, s, &ch); |
467 | goto next; |
468 | } |
469 | s = parse_escape(L, s, e, hex, &ch); |
470 | } |
471 | next: |
472 | add_utf8char(&b, ch); |
473 | } |
474 | luaL_pushresult(&b); |
475 | return 1; |
476 | } |
477 | |
478 | static int Lutf8_insert (lua_State *L) { |
479 | const char *e, *s = check_utf8(L, 1, &e); |
480 | size_t sublen; |
481 | const char *subs; |
482 | luaL_Buffer b; |
483 | int nargs = 2; |
484 | const char *first = e; |
485 | if (lua_type(L, 2) == LUA_TNUMBER) { |
486 | int idx = (int)lua_tointeger(L, 2); |
487 | if (idx != 0) first = utf8_relat(s, e, idx); |
488 | luaL_argcheck(L, first, 2, "invalid index" ); |
489 | ++nargs; |
490 | } |
491 | subs = luaL_checklstring(L, nargs, &sublen); |
492 | luaL_buffinit(L, &b); |
493 | luaL_addlstring(&b, s, first-s); |
494 | luaL_addlstring(&b, subs, sublen); |
495 | luaL_addlstring(&b, first, e-first); |
496 | luaL_pushresult(&b); |
497 | return 1; |
498 | } |
499 | |
500 | static int Lutf8_remove (lua_State *L) { |
501 | const char *e, *s = check_utf8(L, 1, &e); |
502 | lua_Integer posi = luaL_optinteger(L, 2, -1); |
503 | lua_Integer pose = luaL_optinteger(L, 3, -1); |
504 | if (!utf8_range(s, e, &posi, &pose)) |
505 | lua_settop(L, 1); |
506 | else { |
507 | luaL_Buffer b; |
508 | luaL_buffinit(L, &b); |
509 | luaL_addlstring(&b, s, posi); |
510 | luaL_addlstring(&b, s+pose, e-s-pose); |
511 | luaL_pushresult(&b); |
512 | } |
513 | return 1; |
514 | } |
515 | |
516 | static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) { |
517 | utfint ch = 0; |
518 | const char *p; |
519 | if (idx != 0) |
520 | p = utf8_offset(s, e, offset, idx); |
521 | else if (p = s+offset-1, iscont(p)) |
522 | p = utf8_prev(s, p); |
523 | if (p == NULL || p == e) return 0; |
524 | utf8_decode(p, &ch, 0); |
525 | lua_pushinteger(L, p-s+1); |
526 | lua_pushinteger(L, ch); |
527 | return 2; |
528 | } |
529 | |
530 | static int Lutf8_charpos (lua_State *L) { |
531 | const char *e, *s = check_utf8(L, 1, &e); |
532 | lua_Integer offset = 1; |
533 | if (lua_isnoneornil(L, 3)) { |
534 | lua_Integer idx = luaL_optinteger(L, 2, 0); |
535 | if (idx > 0) --idx; |
536 | else if (idx < 0) offset = e-s+1; |
537 | return push_offset(L, s, e, offset, idx); |
538 | } |
539 | offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); |
540 | if (offset < 1) offset = 1; |
541 | return push_offset(L, s, e, offset, luaL_checkinteger(L, 3)); |
542 | } |
543 | |
/* Lua C function: byte position of the n-th character counted from a byte
 * position.
 *   arg 1: string
 *   arg 2: n, the number of characters to move (may be negative or 0)
 *   arg 3: starting byte position (default 1 when n >= 0, else len + 1)
 * Pushes the resulting 1-based byte position, or nil when there is no such
 * character.  For n == 0 it pushes the start of the sequence containing
 * the starting byte.  Raises an error when the starting position is out
 * of range or, for n != 0, points at a continuation byte. */
static int Lutf8_offset (lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer n = luaL_checkinteger(L, 2);
  lua_Integer posi = (n >= 0) ? 1 : len + 1;
  posi = byte_relat(luaL_optinteger(L, 3, posi), len);
  /* note: the check also converts posi to a 0-based offset (--posi) */
  luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
                "position out of range" );
  if (n == 0) {
    /* find beginning of current byte sequence */
    while (posi > 0 && iscont(s + posi)) posi--;
  } else {
    if (iscont(s + posi))
      return luaL_error(L, "initial position is a continuation byte" );
    if (n < 0) {
      while (n < 0 && posi > 0) {  /* move back */
        do {  /* find beginning of previous character */
          posi--;
        } while (posi > 0 && iscont(s + posi));
        n++;
      }
    } else {
      n--;  /* do not move for 1st character */
      while (n > 0 && posi < (lua_Integer)len) {
        do {  /* find beginning of next character */
          posi++;
        } while (iscont(s + posi));  /* (cannot pass final '\0') */
        n--;
      }
    }
  }
  /* n reaches 0 only if every requested step could be taken */
  if (n == 0)  /* did it find given character? */
    lua_pushinteger(L, posi + 1);
  else  /* no such character */
    lua_pushnil(L);
  return 1;
}
581 | |
582 | static int Lutf8_next (lua_State *L) { |
583 | const char *e, *s = check_utf8(L, 1, &e); |
584 | lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); |
585 | lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2)); |
586 | return push_offset(L, s, e, offset, idx); |
587 | } |
588 | |
589 | static int iter_aux (lua_State *L, int strict) { |
590 | const char *e, *s = check_utf8(L, 1, &e); |
591 | int n = CAST(int, lua_tointeger(L, 2)); |
592 | const char *p = n <= 0 ? s : utf8_next(s+n-1, e); |
593 | if (p < e) { |
594 | utfint code = 0; |
595 | utf8_safe_decode(L, p, &code); |
596 | if (strict && utf8_invalid(code)) |
597 | return luaL_error(L, "invalid UTF-8 code" ); |
598 | lua_pushinteger(L, p-s+1); |
599 | lua_pushinteger(L, code); |
600 | return 2; |
601 | } |
602 | return 0; /* no more codepoints */ |
603 | } |
604 | |
605 | static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); } |
606 | static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); } |
607 | |
608 | static int Lutf8_codes (lua_State *L) { |
609 | int lax = lua_toboolean(L, 2); |
610 | luaL_checkstring(L, 1); |
611 | lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); |
612 | lua_pushvalue(L, 1); |
613 | lua_pushinteger(L, 0); |
614 | return 3; |
615 | } |
616 | |
617 | static int Lutf8_width (lua_State *L) { |
618 | int t = lua_type(L, 1); |
619 | int ambi_is_single = !lua_toboolean(L, 2); |
620 | int default_width = CAST(int, luaL_optinteger(L, 3, 0)); |
621 | if (t == LUA_TNUMBER) { |
622 | size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single); |
623 | if (chwidth == 0) chwidth = default_width; |
624 | lua_pushinteger(L, (lua_Integer)chwidth); |
625 | } else if (t != LUA_TSTRING) |
626 | return typeerror(L, 1, "number/string" ); |
627 | else { |
628 | const char *e, *s = to_utf8(L, 1, &e); |
629 | int width = 0; |
630 | while (s < e) { |
631 | utfint ch = 0; |
632 | int chwidth; |
633 | s = utf8_safe_decode(L, s, &ch); |
634 | chwidth = utf8_width(ch, ambi_is_single); |
635 | width += chwidth == 0 ? default_width : chwidth; |
636 | } |
637 | lua_pushinteger(L, (lua_Integer)width); |
638 | } |
639 | return 1; |
640 | } |
641 | |
642 | static int Lutf8_widthindex (lua_State *L) { |
643 | const char *e, *s = check_utf8(L, 1, &e); |
644 | int width = CAST(int, luaL_checkinteger(L, 2)); |
645 | int ambi_is_single = !lua_toboolean(L, 3); |
646 | int default_width = CAST(int, luaL_optinteger(L, 4, 0)); |
647 | size_t idx = 1; |
648 | while (s < e) { |
649 | utfint ch = 0; |
650 | size_t chwidth; |
651 | s = utf8_safe_decode(L, s, &ch); |
652 | chwidth = utf8_width(ch, ambi_is_single); |
653 | if (chwidth == 0) chwidth = default_width; |
654 | width -= CAST(int, chwidth); |
655 | if (width <= 0) { |
656 | lua_pushinteger(L, idx); |
657 | lua_pushinteger(L, width + chwidth); |
658 | lua_pushinteger(L, chwidth); |
659 | return 3; |
660 | } |
661 | ++idx; |
662 | } |
663 | lua_pushinteger(L, (lua_Integer)idx); |
664 | return 1; |
665 | } |
666 | |
667 | static int Lutf8_ncasecmp (lua_State *L) { |
668 | const char *e1, *s1 = check_utf8(L, 1, &e1); |
669 | const char *e2, *s2 = check_utf8(L, 2, &e2); |
670 | while (s1 < e1 || s2 < e2) { |
671 | utfint ch1 = 0, ch2 = 0; |
672 | if (s1 == e1) |
673 | ch2 = 1; |
674 | else if (s2 == e2) |
675 | ch1 = 1; |
676 | else { |
677 | s1 = utf8_safe_decode(L, s1, &ch1); |
678 | s2 = utf8_safe_decode(L, s2, &ch2); |
679 | ch1 = utf8_tofold(ch1); |
680 | ch2 = utf8_tofold(ch2); |
681 | } |
682 | if (ch1 != ch2) { |
683 | lua_pushinteger(L, ch1 > ch2 ? 1 : -1); |
684 | return 1; |
685 | } |
686 | } |
687 | lua_pushinteger(L, 0); |
688 | return 1; |
689 | } |
690 | |
691 | |
692 | /* utf8 pattern matching implement */ |
693 | |
/* Capacity of MatchState.capture; start_capture raises "too many captures"
 * beyond it.  Only defined here when not already defined. */
#ifndef LUA_MAXCAPTURES
# define LUA_MAXCAPTURES 32
#endif /* LUA_MAXCAPTURES */

/* Special values stored in a capture's 'len' field: */
#define CAP_UNFINISHED (-1)  /* capture opened but not closed yet */
#define CAP_POSITION (-2)    /* position capture, i.e. "()" in the pattern */

701 | |
/* State shared by the pattern matching functions during one match. */
typedef struct MatchState {
  int matchdepth;  /* control for recursive depth (to avoid C stack overflow) */
  const char *src_init;  /* init of source string */
  const char *src_end;  /* end ('\0') of source string */
  const char *p_end;  /* end ('\0') of pattern */
  lua_State *L;  /* Lua state used to raise errors */
  int level;  /* total number of captures (finished or unfinished) */
  struct {
    const char *init;  /* where the capture starts in the source string */
    ptrdiff_t len;  /* length in bytes, or CAP_UNFINISHED / CAP_POSITION */
  } capture[LUA_MAXCAPTURES];
} MatchState;
714 | |
/* recursive function: matches pattern p against the source at s */
static const char *match (MatchState *ms, const char *s, const char *p);

/* maximum recursion depth for 'match' */
#if !defined(MAXCCALLS)
#define MAXCCALLS 200
#endif

/* Escape character of the pattern syntax. */
#define L_ESC '%'
/* NOTE(review): presumably the characters with a special meaning in
 * patterns; its use is not visible in this part of the file - confirm. */
#define SPECIALS "^$*+?.([%-"
725 | |
726 | static int check_capture (MatchState *ms, int l) { |
727 | l -= '1'; |
728 | if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) |
729 | return luaL_error(ms->L, "invalid capture index %%%d" , l + 1); |
730 | return l; |
731 | } |
732 | |
733 | static int capture_to_close (MatchState *ms) { |
734 | int level = ms->level; |
735 | while (--level >= 0) |
736 | if (ms->capture[level].len == CAP_UNFINISHED) return level; |
737 | return luaL_error(ms->L, "invalid pattern capture" ); |
738 | } |
739 | |
/* Returns the position just past the single pattern item that starts at p:
 * an escaped character ("%x"), a bracket set ("[...]") or one plain
 * character.  Raises a Lua error when the pattern ends in the middle of an
 * escape or of a set. */
static const char *classend (MatchState *ms, const char *p) {
  utfint ch = 0;
  p = utf8_safe_decode(ms->L, p, &ch);
  switch (ch) {
    case L_ESC: {
      if (p == ms->p_end)
        luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%" ) ")" );
      /* skip the whole (possibly multi-byte) escaped character */
      return utf8_next(p, ms->p_end);
    }
    case '[': {
      if (*p == '^') p++;
      /* do-while: the first character of the set is consumed before ']'
       * is tested for, so a ']' in first position is taken literally */
      do {  /* look for a `]' */
        if (p == ms->p_end)
          luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]" ) ")" );
        if (*(p++) == L_ESC && p < ms->p_end)
          p++;  /* skip escapes (e.g. `%]') */
      } while (*p != ']');
      return p+1;
    }
    default: {
      return p;
    }
  }
}
764 | |
/* Tests character c against the class letter cl (the character following
 * '%' in a pattern).  The letter is lower-cased to select the class; when
 * cl itself is not a lower-case letter the result is negated.  Characters
 * that name no class match only themselves. */
static int match_class (utfint c, utfint cl) {
  int res;
  switch (utf8_tolower(cl)) {
    /* one case per entry of utf8_categories: 'a' -> utf8_isalpha(c), ... */
#define X(cls, name) case cls: res = utf8_is##name(c); break;
    utf8_categories(X)
#undef X
    case 'g' : res = utf8_isgraph(c); break;
    case 'w' : res = utf8_isalnum(c); break;
    case 'z' : res = (c == 0); break;  /* deprecated option */
    default: return (cl == c);
  }
  return (utf8_islower(cl) ? res : !res);
}
778 | |
/* Tests character c against the bracket set that starts at p (which must
 * point at '[').  'ec' limits the scan of the set's contents; the callers
 * in this file pass the position of the closing ']'.
 * Handles a leading '^' (negation), "%x" classes, "a-z" ranges and single
 * characters.  Returns nonzero when c is matched by the set. */
static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) {
  int sig = 1;  /* value to return on a hit; 0 for a negated set */
  assert(*p == '[');
  if (*++p == '^') {
    sig = 0;
    p++;  /* skip the `^' */
  }
  while (p < ec) {
    utfint ch = 0;
    p = utf8_safe_decode(ms->L, p, &ch);
    if (ch == L_ESC) {
      /* "%x": ch becomes the class letter */
      p = utf8_safe_decode(ms->L, p, &ch);
      if (match_class(c, ch))
        return sig;
    } else {
      /* peek at the next character to detect a range "ch-next" */
      utfint next = 0;
      const char *np = utf8_safe_decode(ms->L, p, &next);
      if (next == '-' && np < ec) {
        p = utf8_safe_decode(ms->L, np, &next);
        if (ch <= c && c <= next)
          return sig;
      }
      else if (ch == c) return sig;
    }
  }
  return !sig;
}
806 | |
807 | static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) { |
808 | if (s >= ms->src_end) |
809 | return 0; |
810 | else { |
811 | utfint ch=0, pch=0; |
812 | utf8_safe_decode(ms->L, s, &ch); |
813 | p = utf8_safe_decode(ms->L, p, &pch); |
814 | switch (pch) { |
815 | case '.': return 1; /* matches any char */ |
816 | case L_ESC: utf8_safe_decode(ms->L, p, &pch); |
817 | return match_class(ch, pch); |
818 | case '[': return matchbracketclass(ms, ch, p-1, ep-1); |
819 | default: return pch == ch; |
820 | } |
821 | } |
822 | } |
823 | |
824 | static const char *matchbalance (MatchState *ms, const char *s, const char **p) { |
825 | utfint ch=0, begin=0, end=0; |
826 | *p = utf8_safe_decode(ms->L, *p, &begin); |
827 | if (*p >= ms->p_end) |
828 | luaL_error(ms->L, "malformed pattern " |
829 | "(missing arguments to " LUA_QL("%%b" ) ")" ); |
830 | *p = utf8_safe_decode(ms->L, *p, &end); |
831 | s = utf8_safe_decode(ms->L, s, &ch); |
832 | if (ch != begin) return NULL; |
833 | else { |
834 | int cont = 1; |
835 | while (s < ms->src_end) { |
836 | s = utf8_safe_decode(ms->L, s, &ch); |
837 | if (ch == end) { |
838 | if (--cont == 0) return s; |
839 | } |
840 | else if (ch == begin) cont++; |
841 | } |
842 | } |
843 | return NULL; /* string ends out of balance */ |
844 | } |
845 | |
846 | static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) { |
847 | const char *m = s; /* matched end of single match p */ |
848 | while (singlematch(ms, m, p, ep)) |
849 | m = utf8_next(m, ms->src_end); |
850 | /* keeps trying to match with the maximum repetitions */ |
851 | while (s <= m) { |
852 | const char *res = match(ms, m, ep+1); |
853 | if (res) return res; |
854 | /* else didn't match; reduce 1 repetition to try again */ |
855 | if (s == m) break; |
856 | m = utf8_prev(s, m); |
857 | } |
858 | return NULL; |
859 | } |
860 | |
861 | static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) { |
862 | for (;;) { |
863 | const char *res = match(ms, s, ep+1); |
864 | if (res != NULL) |
865 | return res; |
866 | else if (singlematch(ms, s, p, ep)) |
867 | s = utf8_next(s, ms->src_end); /* try with one more repetition */ |
868 | else return NULL; |
869 | } |
870 | } |
871 | |
872 | static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) { |
873 | const char *res; |
874 | int level = ms->level; |
875 | if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures" ); |
876 | ms->capture[level].init = s; |
877 | ms->capture[level].len = what; |
878 | ms->level = level+1; |
879 | if ((res=match(ms, s, p)) == NULL) /* match failed? */ |
880 | ms->level--; /* undo capture */ |
881 | return res; |
882 | } |
883 | |
884 | static const char *end_capture (MatchState *ms, const char *s, const char *p) { |
885 | int l = capture_to_close(ms); |
886 | const char *res; |
887 | ms->capture[l].len = s - ms->capture[l].init; /* close capture */ |
888 | if ((res = match(ms, s, p)) == NULL) /* match failed? */ |
889 | ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ |
890 | return res; |
891 | } |
892 | |
893 | static const char *match_capture (MatchState *ms, const char *s, int l) { |
894 | size_t len; |
895 | l = check_capture(ms, l); |
896 | len = ms->capture[l].len; |
897 | if ((size_t)(ms->src_end-s) >= len && |
898 | memcmp(ms->capture[l].init, s, len) == 0) |
899 | return s+len; |
900 | else return NULL; |
901 | } |
902 | |
903 | static const char *match (MatchState *ms, const char *s, const char *p) { |
904 | if (ms->matchdepth-- == 0) |
905 | luaL_error(ms->L, "pattern too complex" ); |
906 | init: /* using goto's to optimize tail recursion */ |
907 | if (p != ms->p_end) { /* end of pattern? */ |
908 | utfint ch = 0; |
909 | utf8_safe_decode(ms->L, p, &ch); |
910 | switch (ch) { |
911 | case '(': { /* start capture */ |
912 | if (*(p + 1) == ')') /* position capture? */ |
913 | s = start_capture(ms, s, p + 2, CAP_POSITION); |
914 | else |
915 | s = start_capture(ms, s, p + 1, CAP_UNFINISHED); |
916 | break; |
917 | } |
918 | case ')': { /* end capture */ |
919 | s = end_capture(ms, s, p + 1); |
920 | break; |
921 | } |
922 | case '$': { |
923 | if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ |
924 | goto dflt; /* no; go to default */ |
925 | s = (s == ms->src_end) ? s : NULL; /* check end of string */ |
926 | break; |
927 | } |
928 | case L_ESC: { /* escaped sequence not in the format class[*+?-]? */ |
929 | const char *prev_p = p; |
930 | p = utf8_safe_decode(ms->L, p+1, &ch); |
931 | switch (ch) { |
932 | case 'b': { /* balanced string? */ |
933 | s = matchbalance(ms, s, &p); |
934 | if (s != NULL) |
935 | goto init; /* return match(ms, s, p + 4); */ |
936 | /* else fail (s == NULL) */ |
937 | break; |
938 | } |
939 | case 'f': { /* frontier? */ |
940 | const char *ep; utfint previous = 0, current = 0; |
941 | if (*p != '[') |
942 | luaL_error(ms->L, "missing " LUA_QL("[" ) " after " |
943 | LUA_QL("%%f" ) " in pattern" ); |
944 | ep = classend(ms, p); /* points to what is next */ |
945 | if (s != ms->src_init) |
946 | utf8_decode(utf8_prev(ms->src_init, s), &previous, 0); |
947 | if (s != ms->src_end) |
948 | utf8_decode(s, ¤t, 0); |
949 | if (!matchbracketclass(ms, previous, p, ep - 1) && |
950 | matchbracketclass(ms, current, p, ep - 1)) { |
951 | p = ep; goto init; /* return match(ms, s, ep); */ |
952 | } |
953 | s = NULL; /* match failed */ |
954 | break; |
955 | } |
956 | case '0': case '1': case '2': case '3': |
957 | case '4': case '5': case '6': case '7': |
958 | case '8': case '9': { /* capture results (%0-%9)? */ |
959 | s = match_capture(ms, s, ch); |
960 | if (s != NULL) goto init; /* return match(ms, s, p + 2) */ |
961 | break; |
962 | } |
963 | default: p = prev_p; goto dflt; |
964 | } |
965 | break; |
966 | } |
967 | default: dflt: { /* pattern class plus optional suffix */ |
968 | const char *ep = classend(ms, p); /* points to optional suffix */ |
969 | /* does not match at least once? */ |
970 | if (!singlematch(ms, s, p, ep)) { |
971 | if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ |
972 | p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ |
973 | } else /* '+' or no suffix */ |
974 | s = NULL; /* fail */ |
975 | } else { /* matched once */ |
976 | const char *next_s = utf8_next(s, ms->src_end); |
977 | switch (*ep) { /* handle optional suffix */ |
978 | case '?': { /* optional */ |
979 | const char *res; |
980 | const char *next_ep = utf8_next(ep, ms->p_end); |
981 | if ((res = match(ms, next_s, next_ep)) != NULL) |
982 | s = res; |
983 | else { |
984 | p = next_ep; goto init; /* else return match(ms, s, ep + 1); */ |
985 | } |
986 | break; |
987 | } |
988 | case '+': /* 1 or more repetitions */ |
989 | s = next_s; /* 1 match already done */ |
990 | /* fall through */ |
991 | case '*': /* 0 or more repetitions */ |
992 | s = max_expand(ms, s, p, ep); |
993 | break; |
994 | case '-': /* 0 or more repetitions (minimum) */ |
995 | s = min_expand(ms, s, p, ep); |
996 | break; |
997 | default: /* no suffix */ |
998 | s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */ |
999 | } |
1000 | } |
1001 | break; |
1002 | } |
1003 | } |
1004 | } |
1005 | ms->matchdepth++; |
1006 | return s; |
1007 | } |
1008 | |
/*
 * Byte-wise substring search that tolerates embedded zeros.
 * Looks for the `l2' bytes at `s2' inside the `l1' bytes at `s1'.
 * Returns a pointer to the first occurrence, `s1' itself when the needle
 * is empty, or NULL when there is no occurrence.
 */
static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) {
  size_t tail;    /* needle bytes after the first one */
  size_t window;  /* number of positions where a match may still start */
  if (l2 == 0)
    return s1;  /* the empty needle is found everywhere */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */
  tail = l2 - 1;
  window = l1 - tail;
  while (window > 0) {
    /* memchr locates a candidate for the first byte of the needle */
    const char *hit = (const char *)memchr(s1, *s2, window);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, tail) == 0)
      return hit;
    /* no match here: resume just after the candidate */
    window -= (hit + 1) - s1;
    s1 = hit + 1;
  }
  return NULL;  /* not found */
}
1028 | |
/*
 * Count how many utf8_next steps it takes to get from `s' to `p',
 * never walking past `e'.
 * Returns that count when the walk lands exactly on `p', and one less
 * when it stops anywhere else.
 */
static int get_index (const char *p, const char *s, const char *e) {
  int steps = 0;
  while (s < e && s < p) {
    s = utf8_next(s, e);
    steps++;
  }
  if (s != p)
    steps--;
  return steps;
}
1035 | |
1036 | static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) { |
1037 | if (i >= ms->level) { |
1038 | if (i == 0) /* ms->level == 0, too */ |
1039 | lua_pushlstring(ms->L, s, e - s); /* add whole match */ |
1040 | else |
1041 | luaL_error(ms->L, "invalid capture index" ); |
1042 | } else { |
1043 | ptrdiff_t l = ms->capture[i].len; |
1044 | if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture" ); |
1045 | if (l == CAP_POSITION) { |
1046 | int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end); |
1047 | lua_pushinteger(ms->L, idx+1); |
1048 | } else |
1049 | lua_pushlstring(ms->L, ms->capture[i].init, l); |
1050 | } |
1051 | } |
1052 | |
1053 | static int push_captures (MatchState *ms, const char *s, const char *e) { |
1054 | int i; |
1055 | int nlevels = (ms->level == 0 && s) ? 1 : ms->level; |
1056 | luaL_checkstack(ms->L, nlevels, "too many captures" ); |
1057 | for (i = 0; i < nlevels; i++) |
1058 | push_onecapture(ms, i, s, e); |
1059 | return nlevels; /* number of strings pushed */ |
1060 | } |
1061 | |
1062 | /* check whether pattern has no special characters */ |
1063 | static int nospecials (const char *p, const char * ep) { |
1064 | while (p < ep) { |
1065 | if (strpbrk(p, SPECIALS)) |
1066 | return 0; /* pattern has a special character */ |
1067 | p += strlen(p) + 1; /* may have more after \0 */ |
1068 | } |
1069 | return 1; /* no special chars found */ |
1070 | } |
1071 | |
1072 | |
1073 | /* utf8 pattern matching interface */ |
1074 | |
1075 | static int find_aux (lua_State *L, int find) { |
1076 | const char *es, *s = check_utf8(L, 1, &es); |
1077 | const char *ep, *p = check_utf8(L, 2, &ep); |
1078 | lua_Integer idx = luaL_optinteger(L, 3, 1); |
1079 | const char *init; |
1080 | if (!idx) idx = 1; |
1081 | init = utf8_relat(s, es, CAST(int, idx)); |
1082 | if (init == NULL) { |
1083 | if (idx > 0) { |
1084 | lua_pushnil(L); /* cannot find anything */ |
1085 | return 1; |
1086 | } |
1087 | init = s; |
1088 | } |
1089 | /* explicit request or no special characters? */ |
1090 | if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) { |
1091 | /* do a plain search */ |
1092 | const char *s2 = lmemfind(init, es-init, p, ep-p); |
1093 | if (s2) { |
1094 | const char *e2 = s2 + (ep - p); |
1095 | if (iscont(e2)) e2 = utf8_next(e2, es); |
1096 | lua_pushinteger(L, idx = get_index(s2, s, es) + 1); |
1097 | lua_pushinteger(L, idx + get_index(e2, s2, es) - 1); |
1098 | return 2; |
1099 | } |
1100 | } else { |
1101 | MatchState ms; |
1102 | int anchor = (*p == '^'); |
1103 | if (anchor) p++; /* skip anchor character */ |
1104 | if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */ |
1105 | ms.L = L; |
1106 | ms.matchdepth = MAXCCALLS; |
1107 | ms.src_init = s; |
1108 | ms.src_end = es; |
1109 | ms.p_end = ep; |
1110 | do { |
1111 | const char *res; |
1112 | ms.level = 0; |
1113 | assert(ms.matchdepth == MAXCCALLS); |
1114 | if ((res=match(&ms, init, p)) != NULL) { |
1115 | if (find) { |
1116 | lua_pushinteger(L, idx); /* start */ |
1117 | lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */ |
1118 | return push_captures(&ms, NULL, 0) + 2; |
1119 | } else |
1120 | return push_captures(&ms, init, res); |
1121 | } |
1122 | if (init == es) break; |
1123 | idx += 1; |
1124 | init = utf8_next(init, es); |
1125 | } while (init <= es && !anchor); |
1126 | } |
1127 | lua_pushnil(L); /* not found */ |
1128 | return 1; |
1129 | } |
1130 | |
1131 | static int Lutf8_find (lua_State *L) { return find_aux(L, 1); } |
1132 | static int Lutf8_match (lua_State *L) { return find_aux(L, 0); } |
1133 | |
1134 | static int gmatch_aux (lua_State *L) { |
1135 | MatchState ms; |
1136 | const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es); |
1137 | const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep); |
1138 | const char *src; |
1139 | ms.L = L; |
1140 | ms.matchdepth = MAXCCALLS; |
1141 | ms.src_init = s; |
1142 | ms.src_end = es; |
1143 | ms.p_end = ep; |
1144 | for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); |
1145 | src <= ms.src_end; |
1146 | src = utf8_next(src, ms.src_end)) { |
1147 | const char *e; |
1148 | ms.level = 0; |
1149 | assert(ms.matchdepth == MAXCCALLS); |
1150 | if ((e = match(&ms, src, p)) != NULL) { |
1151 | lua_Integer newstart = e-s; |
1152 | if (e == src) newstart++; /* empty match? go at least one position */ |
1153 | lua_pushinteger(L, newstart); |
1154 | lua_replace(L, lua_upvalueindex(3)); |
1155 | return push_captures(&ms, src, e); |
1156 | } |
1157 | if (src == ms.src_end) break; |
1158 | } |
1159 | return 0; /* not found */ |
1160 | } |
1161 | |
1162 | static int Lutf8_gmatch (lua_State *L) { |
1163 | luaL_checkstring(L, 1); |
1164 | luaL_checkstring(L, 2); |
1165 | lua_settop(L, 2); |
1166 | lua_pushinteger(L, 0); |
1167 | lua_pushcclosure(L, gmatch_aux, 3); |
1168 | return 1; |
1169 | } |
1170 | |
1171 | static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { |
1172 | const char *new_end, *news = to_utf8(ms->L, 3, &new_end); |
1173 | while (news < new_end) { |
1174 | utfint ch = 0; |
1175 | news = utf8_safe_decode(ms->L, news, &ch); |
1176 | if (ch != L_ESC) |
1177 | add_utf8char(b, ch); |
1178 | else { |
1179 | news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */ |
1180 | if (!utf8_isdigit(ch)) { |
1181 | if (ch != L_ESC) |
1182 | luaL_error(ms->L, "invalid use of " LUA_QL("%c" ) |
1183 | " in replacement string" , L_ESC); |
1184 | add_utf8char(b, ch); |
1185 | } else if (ch == '0') |
1186 | luaL_addlstring(b, s, e-s); |
1187 | else { |
1188 | push_onecapture(ms, ch-'1', s, e); |
1189 | luaL_addvalue(b); /* add capture to accumulated result */ |
1190 | } |
1191 | } |
1192 | } |
1193 | } |
1194 | |
1195 | static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) { |
1196 | lua_State *L = ms->L; |
1197 | switch (tr) { |
1198 | case LUA_TFUNCTION: { |
1199 | int n; |
1200 | lua_pushvalue(L, 3); |
1201 | n = push_captures(ms, s, e); |
1202 | lua_call(L, n, 1); |
1203 | break; |
1204 | } |
1205 | case LUA_TTABLE: { |
1206 | push_onecapture(ms, 0, s, e); |
1207 | lua_gettable(L, 3); |
1208 | break; |
1209 | } |
1210 | default: { /* LUA_TNUMBER or LUA_TSTRING */ |
1211 | add_s(ms, b, s, e); |
1212 | return; |
1213 | } |
1214 | } |
1215 | if (!lua_toboolean(L, -1)) { /* nil or false? */ |
1216 | lua_pop(L, 1); |
1217 | lua_pushlstring(L, s, e - s); /* keep original text */ |
1218 | } else if (!lua_isstring(L, -1)) |
1219 | luaL_error(L, "invalid replacement value (a %s)" , luaL_typename(L, -1)); |
1220 | luaL_addvalue(b); /* add result to accumulator */ |
1221 | } |
1222 | |
1223 | static int Lutf8_gsub (lua_State *L) { |
1224 | const char *es, *s = check_utf8(L, 1, &es); |
1225 | const char *ep, *p = check_utf8(L, 2, &ep); |
1226 | int tr = lua_type(L, 3); |
1227 | lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1); |
1228 | int anchor = (*p == '^'); |
1229 | lua_Integer n = 0; |
1230 | MatchState ms; |
1231 | luaL_Buffer b; |
1232 | luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || |
1233 | tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, |
1234 | "string/function/table expected" ); |
1235 | luaL_buffinit(L, &b); |
1236 | if (anchor) p++; /* skip anchor character */ |
1237 | ms.L = L; |
1238 | ms.matchdepth = MAXCCALLS; |
1239 | ms.src_init = s; |
1240 | ms.src_end = es; |
1241 | ms.p_end = ep; |
1242 | while (n < max_s) { |
1243 | const char *e; |
1244 | ms.level = 0; |
1245 | assert(ms.matchdepth == MAXCCALLS); |
1246 | e = match(&ms, s, p); |
1247 | if (e) { |
1248 | n++; |
1249 | add_value(&ms, &b, s, e, tr); |
1250 | } |
1251 | if (e && e > s) /* non empty match? */ |
1252 | s = e; /* skip it */ |
1253 | else if (s < es) { |
1254 | utfint ch = 0; |
1255 | s = utf8_safe_decode(L, s, &ch); |
1256 | add_utf8char(&b, ch); |
1257 | } else break; |
1258 | if (anchor) break; |
1259 | } |
1260 | luaL_addlstring(&b, s, es-s); |
1261 | luaL_pushresult(&b); |
1262 | lua_pushinteger(L, n); /* number of substitutions */ |
1263 | return 2; |
1264 | } |
1265 | |
1266 | |
1267 | /* lua module import interface */ |
1268 | |
/*
 * Lua pattern matching exactly one UTF-8 byte sequence: a lead byte
 * followed by any number of continuation bytes (0x80-0xBF).
 * Before Lua 5.2 the zero byte is written as the %z class instead of a
 * literal \0 (presumably because 5.1 patterns cannot contain embedded
 * zeros).
 */
#if LUA_VERSION_NUM < 502
static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*";
#else
static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*";
#endif
1274 | |
/*
 * Module open function: builds the library table and leaves it on the
 * Lua stack.
 *
 * Every ENTRY(name) registers the C function Lutf8_<name> under the Lua
 * key "name". After the table is created, the field "charpattern" is set
 * to the bytes of UTF8PATT; the explicit length (sizeof minus the
 * terminating NUL) keeps any embedded zero byte in the pattern.
 *
 * Returns 1 (the library table).
 *
 * NOTE(review): the function identifier is missing from the declarator
 * below as seen here (it looks lost in extraction); restore the
 * luaopen_* name the host application registers -- TODO confirm.
 */
int (lua_State *L) {
  luaL_Reg libs[] = {
#define ENTRY(name) { #name, Lutf8_##name }
    ENTRY(offset),
    ENTRY(codes),
    ENTRY(codepoint),

    ENTRY(len),
    ENTRY(sub),
    ENTRY(reverse),
    ENTRY(lower),
    ENTRY(upper),
    ENTRY(title),
    ENTRY(fold),
    ENTRY(byte),
    ENTRY(char),
    ENTRY(escape),
    ENTRY(insert),
    ENTRY(remove),
    ENTRY(charpos),
    ENTRY(next),
    ENTRY(width),
    ENTRY(widthindex),
    ENTRY(ncasecmp),
    ENTRY(find),
    ENTRY(gmatch),
    ENTRY(gsub),
    ENTRY(match),
#undef ENTRY
    { NULL, NULL }  /* sentinel terminating the registration list */
  };

  luaL_newlib(L, libs);

  /* expose the single-character pattern as <module>.charpattern */
  lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1);
  lua_setfield(L, -2, "charpattern");

  return 1;
}
1314 | |