1/*
2** $Id: lutf8lib.c,v 1.13 2014/11/02 19:19:04 roberto Exp $
3** Standard library for UTF-8 manipulation
4** Modified by the LOVE Development Team to work with Lua 5.1's API
5*/
6
7/******************************************************************************
8 * Copyright (C) 1994-2015 Lua.org, PUC-Rio, 2015 LOVE Development Team.
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining
11 * a copy of this software and associated documentation files (the
12 * "Software"), to deal in the Software without restriction, including
13 * without limitation the rights to use, copy, modify, merge, publish,
14 * distribute, sublicense, and/or sell copies of the Software, and to
15 * permit persons to whom the Software is furnished to do so, subject to
16 * the following conditions:
17 *
18 * The above copyright notice and this permission notice shall be
19 * included in all copies or substantial portions of the Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 ******************************************************************************/
29
30#define lutf8lib_c
31
32#include "lprefix.h"
33
34
35#include <assert.h>
36#include <stdlib.h>
37#include <string.h>
38
39#include "lutf8lib.h"
40
41#include "lauxlib.h"
42#include "lualib.h"
43
/* largest code point accepted by the decoder and by utf8.char */
#define MAXUNICODE 0x10FFFF

/* size of buffer for 'utf8esc' function (taken from lobject.h) */
#define UTF8BUFFSZ 8

/* true when the byte pointed to by 'p' has the form 10xxxxxx (continuation byte) */
#define iscont(p) ((*(p) & 0xC0) == 0x80)
50
51
52/* from strlib */
53/* translate a relative string position: negative means back from end */
54static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
55 if (pos >= 0) return pos;
56 else if (0u - (size_t)pos > len) return 0;
57 else return (lua_Integer)len + pos + 1;
58}
59
60
61/*
62** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
63*/
64static const char *utf8_decode (const char *o, int *val) {
65 static unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
66 const unsigned char *s = (const unsigned char *)o;
67 unsigned int c = s[0];
68 unsigned int res = 0; /* final result */
69 if (c < 0x80) /* ascii? */
70 res = c;
71 else {
72 int count = 0; /* to count number of continuation bytes */
73 while (c & 0x40) { /* still have continuation bytes? */
74 int cc = s[++count]; /* read next byte */
75 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
76 return NULL; /* invalid byte sequence */
77 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
78 c <<= 1; /* to test next bit */
79 }
80 res |= ((c & 0x7F) << (count * 5)); /* add first byte */
81 if (count > 3 || res > MAXUNICODE || res <= limits[count])
82 return NULL; /* invalid byte sequence */
83 s += count; /* skip continuation bytes read */
84 }
85 if (val) *val = res;
86 return (const char *)s + 1; /* +1 to include first byte */
87}
88
89
90/*
91** utf8len(s [, i [, j]]) --> number of characters that start in the
92** range [i,j], or nil + current position if 's' is not well formed in
93** that interval
94*/
95static int utflen (lua_State *L) {
96 int n = 0;
97 size_t len;
98 const char *s = luaL_checklstring(L, 1, &len);
99 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
100 lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
101 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
102 "initial position out of string");
103 luaL_argcheck(L, --posj < (lua_Integer)len, 3,
104 "final position out of string");
105 while (posi <= posj) {
106 const char *s1 = utf8_decode(s + posi, NULL);
107 if (s1 == NULL) { /* conversion error? */
108 lua_pushnil(L); /* return nil ... */
109 lua_pushinteger(L, posi + 1); /* ... and current position */
110 return 2;
111 }
112 posi = s1 - s;
113 n++;
114 }
115 lua_pushinteger(L, n);
116 return 1;
117}
118
119
120/*
121** codepoint(s, [i, [j]]) -> returns codepoints for all characters
122** that start in the range [i,j]
123*/
124static int codepoint (lua_State *L) {
125 size_t len;
126 const char *s = luaL_checklstring(L, 1, &len);
127 lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
128 lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
129 int n;
130 const char *se;
131 luaL_argcheck(L, posi >= 1, 2, "out of range");
132 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
133 if (posi > pose) return 0; /* empty interval; return no values */
134 n = (int)(pose - posi + 1);
135 if (posi + n <= pose) /* (lua_Integer -> int) overflow? */
136 return luaL_error(L, "string slice too long");
137 luaL_checkstack(L, n, "string slice too long");
138 n = 0;
139 se = s + pose;
140 for (s += posi - 1; s < se;) {
141 int code;
142 s = utf8_decode(s, &code);
143 if (s == NULL)
144 return luaL_error(L, "invalid UTF-8 code");
145 lua_pushinteger(L, code);
146 n++;
147 }
148 return n;
149}
150
151
152/* taken from lobject.c */
153static int utf8esc (char *buff, unsigned long x) {
154 int n = 1; /* number of bytes put in buffer (backwards) */
155 lua_assert(x <= 0x10FFFF);
156 if (x < 0x80) /* ascii? */
157 buff[UTF8BUFFSZ - 1] = (char) x;
158 else { /* need continuation bytes */
159 unsigned int mfb = 0x3f; /* maximum that fits in first byte */
160 do { /* add continuation bytes */
161 buff[UTF8BUFFSZ - (n++)] = (char) (0x80 | (x & 0x3f));
162 x >>= 6; /* remove added bits */
163 mfb >>= 1; /* now there is one less bit available in first byte */
164 } while (x > mfb); /* still needs continuation byte? */
165 buff[UTF8BUFFSZ - n] = (char) ((~mfb << 1) | x); /* add first byte */
166 }
167 return n;
168}
169
170static void pushutfchar (lua_State *L, int arg) {
171 lua_Integer code = luaL_checkinteger(L, arg);
172 luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
173
174 /* the %U string format does not exist in lua 5.1 or 5.2, so we emulate it */
175 /* (code from luaO_pushvfstring in lobject.c) */
176 char buff[UTF8BUFFSZ];
177 int l = utf8esc(buff, (long) code);
178 lua_pushlstring(L, buff + UTF8BUFFSZ - l, l);
179}
180
181
182/*
183** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
184*/
185static int utfchar (lua_State *L) {
186 int n = lua_gettop(L); /* number of arguments */
187 if (n == 1) /* optimize common case of single char */
188 pushutfchar(L, 1);
189 else {
190 int i;
191 luaL_Buffer b;
192 luaL_buffinit(L, &b);
193 for (i = 1; i <= n; i++) {
194 pushutfchar(L, i);
195 luaL_addvalue(&b);
196 }
197 luaL_pushresult(&b);
198 }
199 return 1;
200}
201
202
203/*
204** offset(s, n, [i]) -> index where n-th character counting from
205** position 'i' starts; 0 means character at 'i'.
206*/
207static int byteoffset (lua_State *L) {
208 size_t len;
209 const char *s = luaL_checklstring(L, 1, &len);
210 lua_Integer n = luaL_checkinteger(L, 2);
211 lua_Integer posi = (n >= 0) ? 1 : len + 1;
212 posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
213 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
214 "position out of range");
215 if (n == 0) {
216 /* find beginning of current byte sequence */
217 while (posi > 0 && iscont(s + posi)) posi--;
218 }
219 else {
220 if (iscont(s + posi))
221 luaL_error(L, "initial position is a continuation byte");
222 if (n < 0) {
223 while (n < 0 && posi > 0) { /* move back */
224 do { /* find beginning of previous character */
225 posi--;
226 } while (posi > 0 && iscont(s + posi));
227 n++;
228 }
229 }
230 else {
231 n--; /* do not move for 1st character */
232 while (n > 0 && posi < (lua_Integer)len) {
233 do { /* find beginning of next character */
234 posi++;
235 } while (iscont(s + posi)); /* (cannot pass final '\0') */
236 n--;
237 }
238 }
239 }
240 if (n == 0) /* did it find given character? */
241 lua_pushinteger(L, posi + 1);
242 else /* no such character */
243 lua_pushnil(L);
244 return 1;
245}
246
247
248static int iter_aux (lua_State *L) {
249 size_t len;
250 const char *s = luaL_checklstring(L, 1, &len);
251 lua_Integer n = lua_tointeger(L, 2) - 1;
252 if (n < 0) /* first iteration? */
253 n = 0; /* start from here */
254 else if (n < (lua_Integer)len) {
255 n++; /* skip current byte */
256 while (iscont(s + n)) n++; /* and its continuations */
257 }
258 if (n >= (lua_Integer)len)
259 return 0; /* no more codepoints */
260 else {
261 int code;
262 const char *next = utf8_decode(s + n, &code);
263 if (next == NULL || iscont(next))
264 return luaL_error(L, "invalid UTF-8 code");
265 lua_pushinteger(L, n + 1);
266 lua_pushinteger(L, code);
267 return 2;
268 }
269}
270
271
272static int iter_codes (lua_State *L) {
273 luaL_checkstring(L, 1);
274 lua_pushcfunction(L, iter_aux);
275 lua_pushvalue(L, 1);
276 lua_pushinteger(L, 0);
277 return 3;
278}
279
280
/*
** Lua pattern to match a single UTF-8 character: one first byte (ASCII or
** in the range 0xC2-0xF4) followed by any number of continuation bytes
** (0x80-0xBF). luaopen_luautf8 stores it in the "charpattern" field.
*/
#if LUA_VERSION_NUM >= 502
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
#else
/* lua 5.1 doesn't support literal null bytes in patterns, so the null byte
   is written with the "%z" class and the range starts at \x01 instead */
#define UTF8PATT "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
#endif
288
289
/*
** Entries of the library table, terminated by {NULL, NULL}. An entry with
** a NULL function is a placeholder: luaopen_luautf8 does not register it
** as a function, but it is still counted when the table is pre-sized, and
** the "charpattern" field is filled in separately with a string.
*/
static struct luaL_Reg funcs[] = {
  {"offset", byteoffset},
  {"codepoint", codepoint},
  {"char", utfchar},
  {"len", utflen},
  {"codes", iter_codes},
  /* placeholders */
  {"charpattern", NULL},
  {NULL, NULL}
};
300
301
302/* modified version of luaopen_utf8, designed to work with lua 5.1-5.3 */
303int luaopen_luautf8 (lua_State *L) {
304 luaL_Reg *l;
305 lua_createtable(L, 0, (int) (sizeof(funcs) / sizeof(luaL_Reg)) - 1);
306 for (l = funcs; l->name != NULL; l++) {
307 if (l->func != NULL) {
308 lua_pushcfunction(L, l->func);
309 lua_setfield(L, -2, l->name);
310 }
311 }
312 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT) / sizeof(char) - 1);
313 lua_setfield(L, -2, "charpattern");
314 return 1;
315}
316