1/*
2 * Integration of https://github.com/starwing/luautf8
3 *
4 * MIT License
5 *
6 * Copyright (c) 2018 Xavier Wang
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in all
16 * copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>


#include <assert.h>
#include <limits.h>
#include <string.h>

#include "../unidata.h"
36
37/* UTF-8 string operations */
38
/* Size in bytes of the scratch buffer handed to utf8_encode(). */
#define UTF8_BUFFSZ     8
/* Largest value accepted by the (extended) encoder/decoder. */
#define UTF8_MAX        0x7FFFFFFFu
/* Largest valid Unicode code point. */
#define UTF8_MAXCP      0x10FFFFu
/* Non-zero when the byte at p has the form 10xxxxxx (continuation byte). */
#define iscont(p)       ((*(p) & 0xC0) == 0x80)
/* Explicit cast helper: CAST(type, expression). */
#define CAST(tp,expr)   ((tp)(expr))

/* Quote a string literal for error messages, unless Lua already provides it. */
#if !defined(LUA_QL)
# define LUA_QL(x)      "'" x "'"
#endif
48
49static int utf8_invalid (utfint ch)
50{ return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); }
51
52static size_t utf8_encode (char *buff, utfint x) {
53 int n = 1; /* number of bytes put in buffer (backwards) */
54 lua_assert(x <= UTF8_MAX);
55 if (x < 0x80) /* ascii? */
56 buff[UTF8_BUFFSZ - 1] = x & 0x7F;
57 else { /* need continuation bytes */
58 utfint mfb = 0x3f; /* maximum that fits in first byte */
59 do { /* add continuation bytes */
60 buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f);
61 x >>= 6; /* remove added bits */
62 mfb >>= 1; /* now there is one less bit available in first byte */
63 } while (x > mfb); /* still needs continuation byte? */
64 buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */
65 }
66 return n;
67}
68
69static const char *utf8_decode (const char *s, utfint *val, int strict) {
70 static const utfint limits[] =
71 {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u};
72 unsigned int c = (unsigned char)s[0];
73 utfint res = 0; /* final result */
74 if (c < 0x80) /* ascii? */
75 res = c;
76 else {
77 int count = 0; /* to count number of continuation bytes */
78 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
79 unsigned int cc = (unsigned char)s[++count]; /* read next byte */
80 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
81 return NULL; /* invalid byte sequence */
82 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
83 }
84 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
85 if (count > 5 || res > UTF8_MAX || res < limits[count])
86 return NULL; /* invalid byte sequence */
87 s += count; /* skip continuation bytes read */
88 }
89 if (strict) {
90 /* check for invalid code points; too large or surrogates */
91 if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu))
92 return NULL;
93 }
94 if (val) *val = res;
95 return s + 1; /* +1 to include first byte */
96}
97
/*
 * Steps backwards from e to the nearest byte in [s, e) that is not a
 * continuation byte.  Returns s when no such byte exists or e <= s.
 */
static const char *utf8_prev (const char *s, const char *e) {
  const char *p = e;
  for (;;) {
    if (p <= s)
      return s;  /* ran out of string */
    --p;
    if (!iscont(p))
      return p;  /* found a non-continuation byte */
  }
}
102
/*
 * Advances from s past the current byte and any continuation bytes that
 * follow it, never going beyond e.  Returns e when s is already at or
 * past e.
 */
static const char *utf8_next (const char *s, const char *e) {
  if (s >= e)
    return e;
  do {
    ++s;
  } while (s < e && iscont(s));
  return s;
}
107
/* Counts how many utf8_next() steps it takes to walk from s to e. */
static size_t utf8_length (const char *s, const char *e) {
  size_t count = 0;
  while (s < e) {
    s = utf8_next(s, e);
    ++count;
  }
  return count;
}
114
115static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
116 const char *p = s + offset - 1;
117 if (idx >= 0) {
118 while (p < e && idx > 0)
119 p = utf8_next(p, e), --idx;
120 return idx == 0 ? p : NULL;
121 } else {
122 while (s < p && idx < 0)
123 p = utf8_prev(s, p), ++idx;
124 return idx == 0 ? p : NULL;
125 }
126}
127
/*
 * Resolves a 1-based character index to a pointer into [s, e).
 * Non-negative indices count from the start, negative ones from the
 * end.  Returns whatever utf8_offset() yields (NULL when out of range).
 */
static const char *utf8_relat (const char *s, const char *e, int idx) {
  if (idx < 0)
    return utf8_offset(s, e, e - s + 1, idx);
  return utf8_offset(s, e, 1, idx - 1);
}
133
134static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) {
135 const char *ps = utf8_relat(s, e, CAST(int, *i));
136 const char *pe = utf8_relat(s, e, CAST(int, *j));
137 *i = (ps ? ps : (*i > 0 ? e : s)) - s;
138 *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s;
139 return *i < *j;
140}
141
142
143/* Unicode character categories */
144
/* Number of elements in a statically sized array. */
#define table_size(t)   (sizeof(t) / sizeof((t)[0]))

/* X-macro list of character classes: X(class letter, table-name stem). */
#define utf8_categories(X)  \
  X('a', alpha)             \
  X('c', cntrl)             \
  X('d', digit)             \
  X('l', lower)             \
  X('p', punct)             \
  X('s', space)             \
  X('t', compose)           \
  X('u', upper)             \
  X('x', xdigit)

/* X-macro list of case converters: X(table-name stem). */
#define utf8_converters(X)  \
  X(lower)                  \
  X(upper)                  \
  X(title)                  \
  X(fold)
163
164static int find_in_range (range_table *t, size_t size, utfint ch) {
165 size_t begin, end;
166
167 begin = 0;
168 end = size;
169
170 while (begin < end) {
171 size_t mid = (begin + end) / 2;
172 if (t[mid].last < ch)
173 begin = mid + 1;
174 else if (t[mid].first > ch)
175 end = mid;
176 else
177 return (ch - t[mid].first) % t[mid].step == 0;
178 }
179
180 return 0;
181}
182
183static int convert_char (conv_table *t, size_t size, utfint ch) {
184 size_t begin, end;
185
186 begin = 0;
187 end = size;
188
189 while (begin < end) {
190 size_t mid = (begin + end) / 2;
191 if (t[mid].last < ch)
192 begin = mid + 1;
193 else if (t[mid].first > ch)
194 end = mid;
195 else if ((ch - t[mid].first) % t[mid].step == 0)
196 return ch + t[mid].offset;
197 else
198 return ch;
199 }
200
201 return ch;
202}
203
204#define define_category(cls, name) static int utf8_is##name (utfint ch)\
205{ return find_in_range(name##_table, table_size(name##_table), ch); }
206#define define_converter(name) static utfint utf8_to##name (utfint ch) \
207{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
208utf8_categories(define_category)
209utf8_converters(define_converter)
210#undef define_category
211#undef define_converter
212
213static int utf8_isgraph (utfint ch) {
214 if (find_in_range(space_table, table_size(space_table), ch))
215 return 0;
216 if (find_in_range(graph_table, table_size(graph_table), ch))
217 return 1;
218 if (find_in_range(compose_table, table_size(compose_table), ch))
219 return 1;
220 return 0;
221}
222
223static int utf8_isalnum (utfint ch) {
224 if (find_in_range(alpha_table, table_size(alpha_table), ch))
225 return 1;
226 if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
227 return 1;
228 return 0;
229}
230
231static int utf8_width (utfint ch, int ambi_is_single) {
232 if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
233 return 2;
234 if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
235 return ambi_is_single ? 1 : 2;
236 if (find_in_range(compose_table, table_size(compose_table), ch))
237 return 0;
238 if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
239 return 0;
240 return 1;
241}
242
243
244/* string module compatible interface */
245
246static int typeerror (lua_State *L, int idx, const char *tname)
247{ return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); }
248
249static const char *check_utf8 (lua_State *L, int idx, const char **end) {
250 size_t len;
251 const char *s = luaL_checklstring(L, idx, &len);
252 if (end) *end = s+len;
253 return s;
254}
255
256static const char *to_utf8 (lua_State *L, int idx, const char **end) {
257 size_t len;
258 const char *s = lua_tolstring(L, idx, &len);
259 if (end) *end = s+len;
260 return s;
261}
262
263static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) {
264 p = utf8_decode(p, pval, 0);
265 if (p == NULL) luaL_error(L, "invalid UTF-8 code");
266 return p;
267}
268
269static void add_utf8char (luaL_Buffer *b, utfint ch) {
270 char buff[UTF8_BUFFSZ];
271 size_t n = utf8_encode(buff, ch);
272 luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n);
273}
274
275static lua_Integer byte_relat (lua_Integer pos, size_t len) {
276 if (pos >= 0) return pos;
277 else if (0u - (size_t)pos > len) return 0;
278 else return (lua_Integer)len + pos + 1;
279}
280
281static int Lutf8_len (lua_State *L) {
282 size_t len, n;
283 const char *s = luaL_checklstring(L, 1, &len), *p, *e;
284 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
285 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len);
286 int lax = lua_toboolean(L, 4);
287 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
288 "initial position out of string");
289 luaL_argcheck(L, --pose < (lua_Integer)len, 3,
290 "final position out of string");
291 for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) {
292 if (lax)
293 p = utf8_next(p, e);
294 else {
295 utfint ch;
296 const char *np = utf8_decode(p, &ch, !lax);
297 if (np == NULL || utf8_invalid(ch)) {
298 lua_pushnil(L);
299 lua_pushinteger(L, p - s + 1);
300 return 2;
301 }
302 p = np;
303 }
304 }
305 lua_pushinteger(L, n);
306 return 1;
307}
308
309static int Lutf8_sub (lua_State *L) {
310 const char *e, *s = check_utf8(L, 1, &e);
311 lua_Integer posi = luaL_checkinteger(L, 2);
312 lua_Integer pose = luaL_optinteger(L, 3, -1);
313 if (utf8_range(s, e, &posi, &pose))
314 lua_pushlstring(L, s+posi, pose-posi);
315 else
316 lua_pushliteral(L, "");
317 return 1;
318}
319
320static int Lutf8_reverse (lua_State *L) {
321 luaL_Buffer b;
322 const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e);
323 (void) ends;
324 int lax = lua_toboolean(L, 2);
325 luaL_buffinit(L, &b);
326 if (lax) {
327 for (prev = e; s < prev; e = prev) {
328 prev = utf8_prev(s, prev);
329 luaL_addlstring(&b, prev, e-prev);
330 }
331 } else {
332 for (prev = e; s < prev; prev = pprev) {
333 utfint code = 0;
334 ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code);
335 assert(ends == prev);
336 if (utf8_invalid(code))
337 return luaL_error(L, "invalid UTF-8 code");
338 if (!utf8_iscompose(code)) {
339 luaL_addlstring(&b, pprev, e-pprev);
340 e = pprev;
341 }
342 }
343 }
344 luaL_pushresult(&b);
345 return 1;
346}
347
348static int Lutf8_byte (lua_State *L) {
349 size_t n = 0;
350 const char *e, *s = check_utf8(L, 1, &e);
351 lua_Integer posi = luaL_optinteger(L, 2, 1);
352 lua_Integer pose = luaL_optinteger(L, 3, posi);
353 if (utf8_range(s, e, &posi, &pose)) {
354 for (e = s + pose, s = s + posi; s < e; ++n) {
355 utfint ch = 0;
356 s = utf8_safe_decode(L, s, &ch);
357 lua_pushinteger(L, ch);
358 }
359 }
360 return CAST(int, n);
361}
362
363static int Lutf8_codepoint (lua_State *L) {
364 const char *e, *s = check_utf8(L, 1, &e);
365 size_t len = e-s;
366 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
367 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len);
368 int lax = lua_toboolean(L, 4);
369 int n;
370 const char *se;
371 luaL_argcheck(L, posi >= 1, 2, "out of range");
372 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
373 if (posi > pose) return 0; /* empty interval; return no values */
374 if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
375 return luaL_error(L, "string slice too long");
376 n = (int)(pose - posi + 1);
377 luaL_checkstack(L, n, "string slice too long");
378 n = 0; /* count the number of returns */
379 se = s + pose; /* string end */
380 for (s += posi - 1; s < se;) {
381 utfint code = 0;
382 s = utf8_safe_decode(L, s, &code);
383 if (!lax && utf8_invalid(code))
384 return luaL_error(L, "invalid UTF-8 code");
385 lua_pushinteger(L, code);
386 n++;
387 }
388 return n;
389}
390
391static int Lutf8_char (lua_State *L) {
392 int i, n = lua_gettop(L); /* number of arguments */
393 luaL_Buffer b;
394 luaL_buffinit(L, &b);
395 for (i = 1; i <= n; ++i) {
396 lua_Integer code = luaL_checkinteger(L, i);
397 luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range");
398 add_utf8char(&b, CAST(utfint, code));
399 }
400 luaL_pushresult(&b);
401 return 1;
402}
403
404#define bind_converter(name) \
405static int Lutf8_##name (lua_State *L) { \
406 int t = lua_type(L, 1); \
407 if (t == LUA_TNUMBER) \
408 lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
409 else if (t == LUA_TSTRING) { \
410 luaL_Buffer b; \
411 const char *e, *s = to_utf8(L, 1, &e); \
412 luaL_buffinit(L, &b); \
413 while (s < e) { \
414 utfint ch = 0; \
415 s = utf8_safe_decode(L, s, &ch); \
416 add_utf8char(&b, utf8_to##name(ch)); \
417 } \
418 luaL_pushresult(&b); \
419 } \
420 else return typeerror(L, 1, "number/string"); \
421 return 1; \
422}
423utf8_converters(bind_converter)
424#undef bind_converter
425
426
427/* unicode extra interface */
428
429static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) {
430 utfint code = 0;
431 int in_bracket = 0;
432 if (*s == '{') ++s, in_bracket = 1;
433 for (; s < e; ++s) {
434 utfint ch = (unsigned char)*s;
435 if (ch >= '0' && ch <= '9') ch = ch - '0';
436 else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A');
437 else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a');
438 else if (!in_bracket) break;
439 else if (ch == '}') { ++s; break; }
440 else luaL_error(L, "invalid escape '%c'", ch);
441 code *= hex ? 16 : 10;
442 code += ch;
443 }
444 *pch = code;
445 return s;
446}
447
448static int Lutf8_escape (lua_State *L) {
449 const char *e, *s = check_utf8(L, 1, &e);
450 luaL_Buffer b;
451 luaL_buffinit(L, &b);
452 while (s < e) {
453 utfint ch = 0;
454 s = utf8_safe_decode(L, s, &ch);
455 if (ch == '%') {
456 int hex = 0;
457 switch (*s) {
458 case '0': case '1': case '2': case '3':
459 case '4': case '5': case '6': case '7':
460 case '8': case '9': case '{':
461 break;
462 case 'x': case 'X': hex = 1; /* fall through */
463 case 'u': case 'U': if (s+1 < e) { ++s; break; }
464 /* fall through */
465 default:
466 s = utf8_safe_decode(L, s, &ch);
467 goto next;
468 }
469 s = parse_escape(L, s, e, hex, &ch);
470 }
471next:
472 add_utf8char(&b, ch);
473 }
474 luaL_pushresult(&b);
475 return 1;
476}
477
478static int Lutf8_insert (lua_State *L) {
479 const char *e, *s = check_utf8(L, 1, &e);
480 size_t sublen;
481 const char *subs;
482 luaL_Buffer b;
483 int nargs = 2;
484 const char *first = e;
485 if (lua_type(L, 2) == LUA_TNUMBER) {
486 int idx = (int)lua_tointeger(L, 2);
487 if (idx != 0) first = utf8_relat(s, e, idx);
488 luaL_argcheck(L, first, 2, "invalid index");
489 ++nargs;
490 }
491 subs = luaL_checklstring(L, nargs, &sublen);
492 luaL_buffinit(L, &b);
493 luaL_addlstring(&b, s, first-s);
494 luaL_addlstring(&b, subs, sublen);
495 luaL_addlstring(&b, first, e-first);
496 luaL_pushresult(&b);
497 return 1;
498}
499
500static int Lutf8_remove (lua_State *L) {
501 const char *e, *s = check_utf8(L, 1, &e);
502 lua_Integer posi = luaL_optinteger(L, 2, -1);
503 lua_Integer pose = luaL_optinteger(L, 3, -1);
504 if (!utf8_range(s, e, &posi, &pose))
505 lua_settop(L, 1);
506 else {
507 luaL_Buffer b;
508 luaL_buffinit(L, &b);
509 luaL_addlstring(&b, s, posi);
510 luaL_addlstring(&b, s+pose, e-s-pose);
511 luaL_pushresult(&b);
512 }
513 return 1;
514}
515
516static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
517 utfint ch = 0;
518 const char *p;
519 if (idx != 0)
520 p = utf8_offset(s, e, offset, idx);
521 else if (p = s+offset-1, iscont(p))
522 p = utf8_prev(s, p);
523 if (p == NULL || p == e) return 0;
524 utf8_decode(p, &ch, 0);
525 lua_pushinteger(L, p-s+1);
526 lua_pushinteger(L, ch);
527 return 2;
528}
529
530static int Lutf8_charpos (lua_State *L) {
531 const char *e, *s = check_utf8(L, 1, &e);
532 lua_Integer offset = 1;
533 if (lua_isnoneornil(L, 3)) {
534 lua_Integer idx = luaL_optinteger(L, 2, 0);
535 if (idx > 0) --idx;
536 else if (idx < 0) offset = e-s+1;
537 return push_offset(L, s, e, offset, idx);
538 }
539 offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
540 if (offset < 1) offset = 1;
541 return push_offset(L, s, e, offset, luaL_checkinteger(L, 3));
542}
543
544static int Lutf8_offset (lua_State *L) {
545 size_t len;
546 const char *s = luaL_checklstring(L, 1, &len);
547 lua_Integer n = luaL_checkinteger(L, 2);
548 lua_Integer posi = (n >= 0) ? 1 : len + 1;
549 posi = byte_relat(luaL_optinteger(L, 3, posi), len);
550 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
551 "position out of range");
552 if (n == 0) {
553 /* find beginning of current byte sequence */
554 while (posi > 0 && iscont(s + posi)) posi--;
555 } else {
556 if (iscont(s + posi))
557 return luaL_error(L, "initial position is a continuation byte");
558 if (n < 0) {
559 while (n < 0 && posi > 0) { /* move back */
560 do { /* find beginning of previous character */
561 posi--;
562 } while (posi > 0 && iscont(s + posi));
563 n++;
564 }
565 } else {
566 n--; /* do not move for 1st character */
567 while (n > 0 && posi < (lua_Integer)len) {
568 do { /* find beginning of next character */
569 posi++;
570 } while (iscont(s + posi)); /* (cannot pass final '\0') */
571 n--;
572 }
573 }
574 }
575 if (n == 0) /* did it find given character? */
576 lua_pushinteger(L, posi + 1);
577 else /* no such character */
578 lua_pushnil(L);
579 return 1;
580}
581
582static int Lutf8_next (lua_State *L) {
583 const char *e, *s = check_utf8(L, 1, &e);
584 lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
585 lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2));
586 return push_offset(L, s, e, offset, idx);
587}
588
589static int iter_aux (lua_State *L, int strict) {
590 const char *e, *s = check_utf8(L, 1, &e);
591 int n = CAST(int, lua_tointeger(L, 2));
592 const char *p = n <= 0 ? s : utf8_next(s+n-1, e);
593 if (p < e) {
594 utfint code = 0;
595 utf8_safe_decode(L, p, &code);
596 if (strict && utf8_invalid(code))
597 return luaL_error(L, "invalid UTF-8 code");
598 lua_pushinteger(L, p-s+1);
599 lua_pushinteger(L, code);
600 return 2;
601 }
602 return 0; /* no more codepoints */
603}
604
605static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); }
606static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); }
607
608static int Lutf8_codes (lua_State *L) {
609 int lax = lua_toboolean(L, 2);
610 luaL_checkstring(L, 1);
611 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
612 lua_pushvalue(L, 1);
613 lua_pushinteger(L, 0);
614 return 3;
615}
616
617static int Lutf8_width (lua_State *L) {
618 int t = lua_type(L, 1);
619 int ambi_is_single = !lua_toboolean(L, 2);
620 int default_width = CAST(int, luaL_optinteger(L, 3, 0));
621 if (t == LUA_TNUMBER) {
622 size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single);
623 if (chwidth == 0) chwidth = default_width;
624 lua_pushinteger(L, (lua_Integer)chwidth);
625 } else if (t != LUA_TSTRING)
626 return typeerror(L, 1, "number/string");
627 else {
628 const char *e, *s = to_utf8(L, 1, &e);
629 int width = 0;
630 while (s < e) {
631 utfint ch = 0;
632 int chwidth;
633 s = utf8_safe_decode(L, s, &ch);
634 chwidth = utf8_width(ch, ambi_is_single);
635 width += chwidth == 0 ? default_width : chwidth;
636 }
637 lua_pushinteger(L, (lua_Integer)width);
638 }
639 return 1;
640}
641
642static int Lutf8_widthindex (lua_State *L) {
643 const char *e, *s = check_utf8(L, 1, &e);
644 int width = CAST(int, luaL_checkinteger(L, 2));
645 int ambi_is_single = !lua_toboolean(L, 3);
646 int default_width = CAST(int, luaL_optinteger(L, 4, 0));
647 size_t idx = 1;
648 while (s < e) {
649 utfint ch = 0;
650 size_t chwidth;
651 s = utf8_safe_decode(L, s, &ch);
652 chwidth = utf8_width(ch, ambi_is_single);
653 if (chwidth == 0) chwidth = default_width;
654 width -= CAST(int, chwidth);
655 if (width <= 0) {
656 lua_pushinteger(L, idx);
657 lua_pushinteger(L, width + chwidth);
658 lua_pushinteger(L, chwidth);
659 return 3;
660 }
661 ++idx;
662 }
663 lua_pushinteger(L, (lua_Integer)idx);
664 return 1;
665}
666
667static int Lutf8_ncasecmp (lua_State *L) {
668 const char *e1, *s1 = check_utf8(L, 1, &e1);
669 const char *e2, *s2 = check_utf8(L, 2, &e2);
670 while (s1 < e1 || s2 < e2) {
671 utfint ch1 = 0, ch2 = 0;
672 if (s1 == e1)
673 ch2 = 1;
674 else if (s2 == e2)
675 ch1 = 1;
676 else {
677 s1 = utf8_safe_decode(L, s1, &ch1);
678 s2 = utf8_safe_decode(L, s2, &ch2);
679 ch1 = utf8_tofold(ch1);
680 ch2 = utf8_tofold(ch2);
681 }
682 if (ch1 != ch2) {
683 lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
684 return 1;
685 }
686 }
687 lua_pushinteger(L, 0);
688 return 1;
689}
690
691
692/* utf8 pattern matching implement */
693
/* Maximum number of captures a pattern may hold (unless set elsewhere). */
#if !defined(LUA_MAXCAPTURES)
# define LUA_MAXCAPTURES  32
#endif /* LUA_MAXCAPTURES */

/* Special values stored in a capture's 'len' field. */
#define CAP_UNFINISHED    (-1)   /* capture opened but not yet closed */
#define CAP_POSITION      (-2)   /* position capture "()" */
701
702typedef struct MatchState {
703 int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
704 const char *src_init; /* init of source string */
705 const char *src_end; /* end ('\0') of source string */
706 const char *p_end; /* end ('\0') of pattern */
707 lua_State *L;
708 int level; /* total number of captures (finished or unfinished) */
709 struct {
710 const char *init;
711 ptrdiff_t len;
712 } capture[LUA_MAXCAPTURES];
713} MatchState;
714
715/* recursive function */
716static const char *match (MatchState *ms, const char *s, const char *p);
717
718/* maximum recursion depth for 'match' */
719#if !defined(MAXCCALLS)
720#define MAXCCALLS 200
721#endif
722
723#define L_ESC '%'
724#define SPECIALS "^$*+?.([%-"
725
726static int check_capture (MatchState *ms, int l) {
727 l -= '1';
728 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
729 return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
730 return l;
731}
732
733static int capture_to_close (MatchState *ms) {
734 int level = ms->level;
735 while (--level >= 0)
736 if (ms->capture[level].len == CAP_UNFINISHED) return level;
737 return luaL_error(ms->L, "invalid pattern capture");
738}
739
740static const char *classend (MatchState *ms, const char *p) {
741 utfint ch = 0;
742 p = utf8_safe_decode(ms->L, p, &ch);
743 switch (ch) {
744 case L_ESC: {
745 if (p == ms->p_end)
746 luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
747 return utf8_next(p, ms->p_end);
748 }
749 case '[': {
750 if (*p == '^') p++;
751 do { /* look for a `]' */
752 if (p == ms->p_end)
753 luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
754 if (*(p++) == L_ESC && p < ms->p_end)
755 p++; /* skip escapes (e.g. `%]') */
756 } while (*p != ']');
757 return p+1;
758 }
759 default: {
760 return p;
761 }
762 }
763}
764
765static int match_class (utfint c, utfint cl) {
766 int res;
767 switch (utf8_tolower(cl)) {
768#define X(cls, name) case cls: res = utf8_is##name(c); break;
769 utf8_categories(X)
770#undef X
771 case 'g' : res = utf8_isgraph(c); break;
772 case 'w' : res = utf8_isalnum(c); break;
773 case 'z' : res = (c == 0); break; /* deprecated option */
774 default: return (cl == c);
775 }
776 return (utf8_islower(cl) ? res : !res);
777}
778
779static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) {
780 int sig = 1;
781 assert(*p == '[');
782 if (*++p == '^') {
783 sig = 0;
784 p++; /* skip the `^' */
785 }
786 while (p < ec) {
787 utfint ch = 0;
788 p = utf8_safe_decode(ms->L, p, &ch);
789 if (ch == L_ESC) {
790 p = utf8_safe_decode(ms->L, p, &ch);
791 if (match_class(c, ch))
792 return sig;
793 } else {
794 utfint next = 0;
795 const char *np = utf8_safe_decode(ms->L, p, &next);
796 if (next == '-' && np < ec) {
797 p = utf8_safe_decode(ms->L, np, &next);
798 if (ch <= c && c <= next)
799 return sig;
800 }
801 else if (ch == c) return sig;
802 }
803 }
804 return !sig;
805}
806
807static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) {
808 if (s >= ms->src_end)
809 return 0;
810 else {
811 utfint ch=0, pch=0;
812 utf8_safe_decode(ms->L, s, &ch);
813 p = utf8_safe_decode(ms->L, p, &pch);
814 switch (pch) {
815 case '.': return 1; /* matches any char */
816 case L_ESC: utf8_safe_decode(ms->L, p, &pch);
817 return match_class(ch, pch);
818 case '[': return matchbracketclass(ms, ch, p-1, ep-1);
819 default: return pch == ch;
820 }
821 }
822}
823
824static const char *matchbalance (MatchState *ms, const char *s, const char **p) {
825 utfint ch=0, begin=0, end=0;
826 *p = utf8_safe_decode(ms->L, *p, &begin);
827 if (*p >= ms->p_end)
828 luaL_error(ms->L, "malformed pattern "
829 "(missing arguments to " LUA_QL("%%b") ")");
830 *p = utf8_safe_decode(ms->L, *p, &end);
831 s = utf8_safe_decode(ms->L, s, &ch);
832 if (ch != begin) return NULL;
833 else {
834 int cont = 1;
835 while (s < ms->src_end) {
836 s = utf8_safe_decode(ms->L, s, &ch);
837 if (ch == end) {
838 if (--cont == 0) return s;
839 }
840 else if (ch == begin) cont++;
841 }
842 }
843 return NULL; /* string ends out of balance */
844}
845
846static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
847 const char *m = s; /* matched end of single match p */
848 while (singlematch(ms, m, p, ep))
849 m = utf8_next(m, ms->src_end);
850 /* keeps trying to match with the maximum repetitions */
851 while (s <= m) {
852 const char *res = match(ms, m, ep+1);
853 if (res) return res;
854 /* else didn't match; reduce 1 repetition to try again */
855 if (s == m) break;
856 m = utf8_prev(s, m);
857 }
858 return NULL;
859}
860
861static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
862 for (;;) {
863 const char *res = match(ms, s, ep+1);
864 if (res != NULL)
865 return res;
866 else if (singlematch(ms, s, p, ep))
867 s = utf8_next(s, ms->src_end); /* try with one more repetition */
868 else return NULL;
869 }
870}
871
872static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) {
873 const char *res;
874 int level = ms->level;
875 if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
876 ms->capture[level].init = s;
877 ms->capture[level].len = what;
878 ms->level = level+1;
879 if ((res=match(ms, s, p)) == NULL) /* match failed? */
880 ms->level--; /* undo capture */
881 return res;
882}
883
884static const char *end_capture (MatchState *ms, const char *s, const char *p) {
885 int l = capture_to_close(ms);
886 const char *res;
887 ms->capture[l].len = s - ms->capture[l].init; /* close capture */
888 if ((res = match(ms, s, p)) == NULL) /* match failed? */
889 ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
890 return res;
891}
892
893static const char *match_capture (MatchState *ms, const char *s, int l) {
894 size_t len;
895 l = check_capture(ms, l);
896 len = ms->capture[l].len;
897 if ((size_t)(ms->src_end-s) >= len &&
898 memcmp(ms->capture[l].init, s, len) == 0)
899 return s+len;
900 else return NULL;
901}
902
/*
 * Core matcher: tries to match pattern p against the subject at s.
 *
 *   ms - match state (subject/pattern bounds, captures, depth budget)
 *   s  - current position in the subject
 *   p  - current position in the pattern
 *
 * Returns the position in the subject where the match ends, or NULL
 * when the pattern does not match at s.  Raises "pattern too complex"
 * when the recursion budget in ms->matchdepth is used up; the budget is
 * given back on normal return.  Simple continuations are handled by
 * jumping back to 'init' instead of recursing.
 */
static const char *match (MatchState *ms, const char *s, const char *p) {
  if (ms->matchdepth-- == 0)
    luaL_error(ms->L, "pattern too complex");
  init: /* using goto's to optimize tail recursion */
  if (p != ms->p_end) {  /* end of pattern? */
    utfint ch = 0;
    utf8_safe_decode(ms->L, p, &ch);  /* peek at the next pattern char */
    switch (ch) {
      case '(': {  /* start capture */
        if (*(p + 1) == ')')  /* position capture? */
          s = start_capture(ms, s, p + 2, CAP_POSITION);
        else
          s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
        break;
      }
      case ')': {  /* end capture */
        s = end_capture(ms, s, p + 1);
        break;
      }
      case '$': {
        if ((p + 1) != ms->p_end)  /* is the `$' the last char in pattern? */
          goto dflt;  /* no; go to default */
        s = (s == ms->src_end) ? s : NULL;  /* check end of string */
        break;
      }
      case L_ESC: {  /* escaped sequence not in the format class[*+?-]? */
        const char *prev_p = p;  /* remembered so 'default' can rewind */
        p = utf8_safe_decode(ms->L, p+1, &ch);  /* ch = char after '%' */
        switch (ch) {
          case 'b': {  /* balanced string? */
            s = matchbalance(ms, s, &p);  /* also advances p past both delimiters */
            if (s != NULL)
              goto init;  /* return match(ms, s, p + 4); */
            /* else fail (s == NULL) */
            break;
          }
          case 'f': {  /* frontier? */
            const char *ep; utfint previous = 0, current = 0;
            if (*p != '[')
              luaL_error(ms->L, "missing " LUA_QL("[") " after "
                                 LUA_QL("%%f") " in pattern");
            ep = classend(ms, p);  /* points to what is next */
            /* previous/current stay 0 at the start/end of the subject */
            if (s != ms->src_init)
              utf8_decode(utf8_prev(ms->src_init, s), &previous, 0);
            if (s != ms->src_end)
              utf8_decode(s, &current, 0);
            if (!matchbracketclass(ms, previous, p, ep - 1) &&
                 matchbracketclass(ms, current, p, ep - 1)) {
              p = ep; goto init;  /* return match(ms, s, ep); */
            }
            s = NULL;  /* match failed */
            break;
          }
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
          case '8': case '9': {  /* capture results (%0-%9)? */
            s = match_capture(ms, s, ch);
            if (s != NULL) goto init;  /* return match(ms, s, p + 2) */
            break;
          }
          /* ordinary escape: rewind to the '%' and treat it as a class */
          default: p = prev_p; goto dflt;
        }
        break;
      }
      default: dflt: {  /* pattern class plus optional suffix */
        const char *ep = classend(ms, p);  /* points to optional suffix */
        /* does not match at least once? */
        if (!singlematch(ms, s, p, ep)) {
          if (*ep == '*' || *ep == '?' || *ep == '-') {  /* accept empty? */
            p = ep + 1; goto init;  /* return match(ms, s, ep + 1); */
          } else  /* '+' or no suffix */
            s = NULL;  /* fail */
        } else {  /* matched once */
          const char *next_s = utf8_next(s, ms->src_end);  /* subject after the matched char */
          switch (*ep) {  /* handle optional suffix */
            case '?': {  /* optional */
              const char *res;
              const char *next_ep = utf8_next(ep, ms->p_end);
              if ((res = match(ms, next_s, next_ep)) != NULL)
                s = res;
              else {
                p = next_ep; goto init;  /* else return match(ms, s, ep + 1); */
              }
              break;
            }
            case '+':  /* 1 or more repetitions */
              s = next_s;  /* 1 match already done */
              /* fall through */
            case '*':  /* 0 or more repetitions */
              s = max_expand(ms, s, p, ep);
              break;
            case '-':  /* 0 or more repetitions (minimum) */
              s = min_expand(ms, s, p, ep);
              break;
            default:  /* no suffix */
              s = next_s; p = ep; goto init;  /* return match(ms, s + 1, ep); */
          }
        }
        break;
      }
    }
  }
  ms->matchdepth++;  /* give the recursion budget back */
  return s;
}
1008
/*
 * Finds the first occurrence of the l2 bytes at s2 inside the l1 bytes
 * at s1 (both may contain '\0').  Returns a pointer to the occurrence,
 * s1 itself for an empty needle, or NULL when there is none.
 */
static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) {
  size_t tail;   /* needle bytes after the first one */
  size_t span;   /* number of positions where the needle could still start */
  if (l2 == 0)
    return s1;   /* empty strings are everywhere */
  if (l2 > l1)
    return NULL; /* needle longer than haystack */
  tail = l2 - 1;
  span = l1 - tail;
  while (span > 0) {
    /* locate the needle's first byte, then compare the remainder */
    const char *hit = (const char *)memchr(s1, *s2, span);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, tail) == 0)
      return hit;
    /* resume the search right after the rejected candidate */
    span -= (size_t)(hit + 1 - s1);
    s1 = hit + 1;
  }
  return NULL;   /* not found */
}
1028
/*
 * Counts the utf8_next() steps needed to get from s to p within [s, e).
 * Returns that count when a step lands exactly on p, and one less
 * otherwise.
 */
static int get_index (const char *p, const char *s, const char *e) {
  int idx = 0;
  while (s < e && s < p) {
    s = utf8_next(s, e);
    ++idx;
  }
  return (s == p) ? idx : idx - 1;
}
1035
1036static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) {
1037 if (i >= ms->level) {
1038 if (i == 0) /* ms->level == 0, too */
1039 lua_pushlstring(ms->L, s, e - s); /* add whole match */
1040 else
1041 luaL_error(ms->L, "invalid capture index");
1042 } else {
1043 ptrdiff_t l = ms->capture[i].len;
1044 if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1045 if (l == CAP_POSITION) {
1046 int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end);
1047 lua_pushinteger(ms->L, idx+1);
1048 } else
1049 lua_pushlstring(ms->L, ms->capture[i].init, l);
1050 }
1051}
1052
1053static int push_captures (MatchState *ms, const char *s, const char *e) {
1054 int i;
1055 int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1056 luaL_checkstack(ms->L, nlevels, "too many captures");
1057 for (i = 0; i < nlevels; i++)
1058 push_onecapture(ms, i, s, e);
1059 return nlevels; /* number of strings pushed */
1060}
1061
1062/* check whether pattern has no special characters */
1063static int nospecials (const char *p, const char * ep) {
1064 while (p < ep) {
1065 if (strpbrk(p, SPECIALS))
1066 return 0; /* pattern has a special character */
1067 p += strlen(p) + 1; /* may have more after \0 */
1068 }
1069 return 1; /* no special chars found */
1070}
1071
1072
1073/* utf8 pattern matching interface */
1074
1075static int find_aux (lua_State *L, int find) {
1076 const char *es, *s = check_utf8(L, 1, &es);
1077 const char *ep, *p = check_utf8(L, 2, &ep);
1078 lua_Integer idx = luaL_optinteger(L, 3, 1);
1079 const char *init;
1080 if (!idx) idx = 1;
1081 init = utf8_relat(s, es, CAST(int, idx));
1082 if (init == NULL) {
1083 if (idx > 0) {
1084 lua_pushnil(L); /* cannot find anything */
1085 return 1;
1086 }
1087 init = s;
1088 }
1089 /* explicit request or no special characters? */
1090 if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1091 /* do a plain search */
1092 const char *s2 = lmemfind(init, es-init, p, ep-p);
1093 if (s2) {
1094 const char *e2 = s2 + (ep - p);
1095 if (iscont(e2)) e2 = utf8_next(e2, es);
1096 lua_pushinteger(L, idx = get_index(s2, s, es) + 1);
1097 lua_pushinteger(L, idx + get_index(e2, s2, es) - 1);
1098 return 2;
1099 }
1100 } else {
1101 MatchState ms;
1102 int anchor = (*p == '^');
1103 if (anchor) p++; /* skip anchor character */
1104 if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */
1105 ms.L = L;
1106 ms.matchdepth = MAXCCALLS;
1107 ms.src_init = s;
1108 ms.src_end = es;
1109 ms.p_end = ep;
1110 do {
1111 const char *res;
1112 ms.level = 0;
1113 assert(ms.matchdepth == MAXCCALLS);
1114 if ((res=match(&ms, init, p)) != NULL) {
1115 if (find) {
1116 lua_pushinteger(L, idx); /* start */
1117 lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1118 return push_captures(&ms, NULL, 0) + 2;
1119 } else
1120 return push_captures(&ms, init, res);
1121 }
1122 if (init == es) break;
1123 idx += 1;
1124 init = utf8_next(init, es);
1125 } while (init <= es && !anchor);
1126 }
1127 lua_pushnil(L); /* not found */
1128 return 1;
1129}
1130
1131static int Lutf8_find (lua_State *L) { return find_aux(L, 1); }
1132static int Lutf8_match (lua_State *L) { return find_aux(L, 0); }
1133
1134static int gmatch_aux (lua_State *L) {
1135 MatchState ms;
1136 const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1137 const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1138 const char *src;
1139 ms.L = L;
1140 ms.matchdepth = MAXCCALLS;
1141 ms.src_init = s;
1142 ms.src_end = es;
1143 ms.p_end = ep;
1144 for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1145 src <= ms.src_end;
1146 src = utf8_next(src, ms.src_end)) {
1147 const char *e;
1148 ms.level = 0;
1149 assert(ms.matchdepth == MAXCCALLS);
1150 if ((e = match(&ms, src, p)) != NULL) {
1151 lua_Integer newstart = e-s;
1152 if (e == src) newstart++; /* empty match? go at least one position */
1153 lua_pushinteger(L, newstart);
1154 lua_replace(L, lua_upvalueindex(3));
1155 return push_captures(&ms, src, e);
1156 }
1157 if (src == ms.src_end) break;
1158 }
1159 return 0; /* not found */
1160}
1161
1162static int Lutf8_gmatch (lua_State *L) {
1163 luaL_checkstring(L, 1);
1164 luaL_checkstring(L, 2);
1165 lua_settop(L, 2);
1166 lua_pushinteger(L, 0);
1167 lua_pushcclosure(L, gmatch_aux, 3);
1168 return 1;
1169}
1170
1171static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) {
1172 const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1173 while (news < new_end) {
1174 utfint ch = 0;
1175 news = utf8_safe_decode(ms->L, news, &ch);
1176 if (ch != L_ESC)
1177 add_utf8char(b, ch);
1178 else {
1179 news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */
1180 if (!utf8_isdigit(ch)) {
1181 if (ch != L_ESC)
1182 luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1183 " in replacement string", L_ESC);
1184 add_utf8char(b, ch);
1185 } else if (ch == '0')
1186 luaL_addlstring(b, s, e-s);
1187 else {
1188 push_onecapture(ms, ch-'1', s, e);
1189 luaL_addvalue(b); /* add capture to accumulated result */
1190 }
1191 }
1192 }
1193}
1194
1195static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) {
1196 lua_State *L = ms->L;
1197 switch (tr) {
1198 case LUA_TFUNCTION: {
1199 int n;
1200 lua_pushvalue(L, 3);
1201 n = push_captures(ms, s, e);
1202 lua_call(L, n, 1);
1203 break;
1204 }
1205 case LUA_TTABLE: {
1206 push_onecapture(ms, 0, s, e);
1207 lua_gettable(L, 3);
1208 break;
1209 }
1210 default: { /* LUA_TNUMBER or LUA_TSTRING */
1211 add_s(ms, b, s, e);
1212 return;
1213 }
1214 }
1215 if (!lua_toboolean(L, -1)) { /* nil or false? */
1216 lua_pop(L, 1);
1217 lua_pushlstring(L, s, e - s); /* keep original text */
1218 } else if (!lua_isstring(L, -1))
1219 luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1220 luaL_addvalue(b); /* add result to accumulator */
1221}
1222
1223static int Lutf8_gsub (lua_State *L) {
1224 const char *es, *s = check_utf8(L, 1, &es);
1225 const char *ep, *p = check_utf8(L, 2, &ep);
1226 int tr = lua_type(L, 3);
1227 lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1228 int anchor = (*p == '^');
1229 lua_Integer n = 0;
1230 MatchState ms;
1231 luaL_Buffer b;
1232 luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1233 tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1234 "string/function/table expected");
1235 luaL_buffinit(L, &b);
1236 if (anchor) p++; /* skip anchor character */
1237 ms.L = L;
1238 ms.matchdepth = MAXCCALLS;
1239 ms.src_init = s;
1240 ms.src_end = es;
1241 ms.p_end = ep;
1242 while (n < max_s) {
1243 const char *e;
1244 ms.level = 0;
1245 assert(ms.matchdepth == MAXCCALLS);
1246 e = match(&ms, s, p);
1247 if (e) {
1248 n++;
1249 add_value(&ms, &b, s, e, tr);
1250 }
1251 if (e && e > s) /* non empty match? */
1252 s = e; /* skip it */
1253 else if (s < es) {
1254 utfint ch = 0;
1255 s = utf8_safe_decode(L, s, &ch);
1256 add_utf8char(&b, ch);
1257 } else break;
1258 if (anchor) break;
1259 }
1260 luaL_addlstring(&b, s, es-s);
1261 luaL_pushresult(&b);
1262 lua_pushinteger(L, n); /* number of substitutions */
1263 return 2;
1264}
1265
1266
1267/* lua module import interface */
1268
/*
** Lua pattern stored in the module's `charpattern' field.  Builds with
** LUA_VERSION_NUM below 502 use the `%z' class with a range starting at
** \1; later ones put a literal '\0' in the range.
*/
#if LUA_VERSION_NUM < 502
static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*";
#else
static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*";
#endif
1274
1275int luaopen_utf8extra (lua_State *L) {
1276 luaL_Reg libs[] = {
1277#define ENTRY(name) { #name, Lutf8_##name }
1278 ENTRY(offset),
1279 ENTRY(codes),
1280 ENTRY(codepoint),
1281
1282 ENTRY(len),
1283 ENTRY(sub),
1284 ENTRY(reverse),
1285 ENTRY(lower),
1286 ENTRY(upper),
1287 ENTRY(title),
1288 ENTRY(fold),
1289 ENTRY(byte),
1290 ENTRY(char),
1291 ENTRY(escape),
1292 ENTRY(insert),
1293 ENTRY(remove),
1294 ENTRY(charpos),
1295 ENTRY(next),
1296 ENTRY(width),
1297 ENTRY(widthindex),
1298 ENTRY(ncasecmp),
1299 ENTRY(find),
1300 ENTRY(gmatch),
1301 ENTRY(gsub),
1302 ENTRY(match),
1303#undef ENTRY
1304 { NULL, NULL }
1305 };
1306
1307 luaL_newlib(L, libs);
1308
1309 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1);
1310 lua_setfield(L, -2, "charpattern");
1311
1312 return 1;
1313}
1314