1#include "api.h"
2
3#define PCRE2_CODE_UNIT_WIDTH 8
4
5#include <string.h>
6#include <pcre2.h>
7#include <stdbool.h>
8
9typedef struct RegexState {
10 pcre2_code* re;
11 pcre2_match_data* match_data;
12 const char* subject;
13 size_t subject_len;
14 size_t offset;
15 bool regex_compiled;
16 bool found;
17} RegexState;
18
19static pcre2_code* regex_get_pattern(lua_State *L, bool* should_free) {
20 pcre2_code* re = NULL;
21 *should_free = false;
22
23 if (lua_type(L, 1) == LUA_TTABLE) {
24 lua_rawgeti(L, 1, 1);
25 re = (pcre2_code*)lua_touserdata(L, -1);
26 lua_settop(L, -2);
27 } else {
28 int errornumber;
29 PCRE2_SIZE erroroffset;
30 size_t pattern_len = 0;
31 const char* pattern = luaL_checklstring(L, 1, &pattern_len);
32
33 re = pcre2_compile(
34 (PCRE2_SPTR)pattern,
35 pattern_len, PCRE2_UTF,
36 &errornumber, &erroroffset, NULL
37 );
38
39 if (re == NULL) {
40 PCRE2_UCHAR errmsg[256];
41 pcre2_get_error_message(errornumber, errmsg, sizeof(errmsg));
42 luaL_error(
43 L, "regex pattern error at offset %d: %s",
44 (int)erroroffset, errmsg
45 );
46 return NULL;
47 }
48
49 pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
50
51 *should_free = true;
52 }
53
54 return re;
55}
56
57static int regex_gmatch_iterator(lua_State *L) {
58 RegexState *state = (RegexState*)lua_touserdata(L, lua_upvalueindex(3));
59
60 if (state->found) {
61 int rc = pcre2_match(
62 state->re,
63 (PCRE2_SPTR)state->subject, state->subject_len,
64 state->offset, 0, state->match_data, NULL
65 );
66
67 if (rc < 0) {
68 if (rc != PCRE2_ERROR_NOMATCH) {
69 PCRE2_UCHAR buffer[120];
70 pcre2_get_error_message(rc, buffer, sizeof(buffer));
71 luaL_error(L, "regex matching error %d: %s", rc, buffer);
72 }
73 goto clean;
74 } else {
75 size_t ovector_count = pcre2_get_ovector_count(state->match_data);
76 if (ovector_count > 0) {
77 PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(state->match_data);
78 if (ovector[0] > ovector[1]) {
79 /* We must guard against patterns such as /(?=.\K)/ that use \K in an
80 assertion to set the start of a match later than its end. In the editor,
81 we just detect this case and give up. */
82 luaL_error(L, "regex matching error: \\K was used in an assertion to "
83 " set the match start after its end");
84 goto clean;
85 }
86
87 int index = 0;
88 if (ovector_count > 1) index = 2;
89
90 int total = 0;
91 int total_results = ovector_count * 2;
92 size_t last_offset = 0;
93 for (int i = index; i < total_results; i+=2) {
94 if (ovector[i] == ovector[i+1])
95 lua_pushinteger(L, ovector[i] + 1);
96 else
97 lua_pushlstring(L, state->subject+ovector[i], ovector[i+1] - ovector[i]);
98 last_offset = ovector[i+1];
99 total++;
100 }
101
102 if (last_offset - 1 < state->subject_len)
103 state->offset = last_offset;
104 else
105 state->found = false;
106
107 return total;
108 } else {
109 state->found = false;
110 }
111 }
112 }
113
114clean:
115 if (state->regex_compiled) pcre2_code_free(state->re);
116 pcre2_match_data_free(state->match_data);
117
118 return 0; /* not found */
119}
120
121static size_t regex_offset_relative(lua_Integer pos, size_t len) {
122 if (pos > 0)
123 return (size_t)pos;
124 else if (pos == 0)
125 return 1;
126 else if (pos < -(lua_Integer)len) /* inverted comparison */
127 return 1; /* clip to 1 */
128 else return len + (size_t)pos + 1;
129}
130
131static int f_pcre_gc(lua_State* L) {
132 lua_rawgeti(L, -1, 1);
133 pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
134 if (re)
135 pcre2_code_free(re);
136 return 0;
137}
138
139static int f_pcre_compile(lua_State *L) {
140 size_t len;
141 PCRE2_SIZE errorOffset;
142 int errorNumber;
143 int pattern = PCRE2_UTF;
144 const char* str = luaL_checklstring(L, 1, &len);
145 if (lua_gettop(L) > 1) {
146 const char* options = luaL_checkstring(L, 2);
147 if (strstr(options,"i"))
148 pattern |= PCRE2_CASELESS;
149 if (strstr(options,"m"))
150 pattern |= PCRE2_MULTILINE;
151 if (strstr(options,"s"))
152 pattern |= PCRE2_DOTALL;
153 }
154 pcre2_code* re = pcre2_compile(
155 (PCRE2_SPTR)str,
156 len,
157 pattern,
158 &errorNumber,
159 &errorOffset,
160 NULL
161 );
162 if (re) {
163 pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
164 lua_newtable(L);
165 lua_pushlightuserdata(L, re);
166 lua_rawseti(L, -2, 1);
167 luaL_setmetatable(L, "regex");
168 return 1;
169 }
170 PCRE2_UCHAR buffer[256];
171 pcre2_get_error_message(errorNumber, buffer, sizeof(buffer));
172 lua_pushnil(L);
173 char message[1024];
174 len = snprintf(message, sizeof(message), "regex compilation failed at offset %d: %s", (int)errorOffset, buffer);
175 lua_pushlstring(L, message, len);
176 return 2;
177}
178
179// Takes string, compiled regex, returns list of indices of matched groups
180// (including the whole match), if a match was found.
181static int f_pcre_match(lua_State *L) {
182 size_t len, offset = 1, opts = 0;
183 bool regex_compiled = false;
184 pcre2_code* re = regex_get_pattern(L, &regex_compiled);
185 if (!re) return 0 ;
186 const char* str = luaL_checklstring(L, 2, &len);
187 if (lua_gettop(L) > 2)
188 offset = regex_offset_relative(luaL_checknumber(L, 3), len);
189 offset -= 1;
190 len -= offset;
191 if (lua_gettop(L) > 3)
192 opts = luaL_checknumber(L, 4);
193 lua_rawgeti(L, 1, 1);
194 pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
195 int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL);
196 if (rc < 0) {
197 if (regex_compiled) pcre2_code_free(re);
198 pcre2_match_data_free(md);
199 if (rc != PCRE2_ERROR_NOMATCH) {
200 PCRE2_UCHAR buffer[120];
201 pcre2_get_error_message(rc, buffer, sizeof(buffer));
202 luaL_error(L, "regex matching error %d: %s", rc, buffer);
203 }
204 return 0;
205 }
206 PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);
207 if (ovector[0] > ovector[1]) {
208 /* We must guard against patterns such as /(?=.\K)/ that use \K in an
209 assertion to set the start of a match later than its end. In the editor,
210 we just detect this case and give up. */
211 luaL_error(L, "regex matching error: \\K was used in an assertion to "
212 " set the match start after its end");
213 if (regex_compiled) pcre2_code_free(re);
214 pcre2_match_data_free(md);
215 return 0;
216 }
217 for (int i = 0; i < rc*2; i++)
218 lua_pushinteger(L, ovector[i]+offset+1);
219 if (regex_compiled) pcre2_code_free(re);
220 pcre2_match_data_free(md);
221 return rc*2;
222}
223
224static int f_pcre_gmatch(lua_State *L) {
225 /* pattern param */
226 bool regex_compiled = false;
227 pcre2_code* re = regex_get_pattern(L, &regex_compiled);
228 if (!re) return 0;
229 size_t subject_len = 0;
230
231 /* subject param */
232 const char* subject = luaL_checklstring(L, 2, &subject_len);
233
234 /* offset param */
235 size_t offset = regex_offset_relative(
236 luaL_optnumber(L, 3, 1), subject_len
237 ) - 1;
238
239 /* keep strings on closure to avoid being collected */
240 lua_settop(L, 2);
241
242 RegexState *state;
243 state = (RegexState*)lua_newuserdata(L, sizeof(RegexState));
244
245 state->re = re;
246 state->match_data = pcre2_match_data_create_from_pattern(re, NULL);
247 state->subject = subject;
248 state->subject_len = subject_len;
249 state->offset = offset;
250 state->found = true;
251 state->regex_compiled = regex_compiled;
252
253 lua_pushcclosure(L, regex_gmatch_iterator, 3);
254 return 1;
255}
256
257static int f_pcre_gsub(lua_State *L) {
258 size_t subject_len = 0, replacement_len = 0;
259
260 bool regex_compiled = false;
261 pcre2_code* re = regex_get_pattern(L, &regex_compiled);
262 if (!re) return 0 ;
263
264 char* subject = (char*) luaL_checklstring(L, 2, &subject_len);
265 const char* replacement = luaL_checklstring(L, 3, &replacement_len);
266 int limit = luaL_optinteger(L, 4, 0);
267 if (limit < 0 ) limit = 0;
268
269 pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
270
271 size_t buffer_size = 1024;
272 char *output = (char *)malloc(buffer_size);
273
274 int options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED;
275 if (limit == 0) options |= PCRE2_SUBSTITUTE_GLOBAL;
276
277 int results_count = 0;
278 int limit_count = 0;
279 bool done = false;
280 size_t offset = 0;
281 PCRE2_SIZE outlen = buffer_size;
282 while (!done) {
283 results_count = pcre2_substitute(
284 re,
285 (PCRE2_SPTR)subject, subject_len,
286 offset, options,
287 match_data, NULL,
288 (PCRE2_SPTR)replacement, replacement_len,
289 (PCRE2_UCHAR*)output, &outlen
290 );
291
292 if (results_count != PCRE2_ERROR_NOMEMORY || buffer_size >= outlen) {
293 /* PCRE2_SUBSTITUTE_GLOBAL code path (fastest) */
294 if(limit == 0) {
295 done = true;
296 /* non PCRE2_SUBSTITUTE_GLOBAL with limit code path (slower) */
297 } else {
298 size_t ovector_count = pcre2_get_ovector_count(match_data);
299 if (results_count > 0 && ovector_count > 0) {
300 limit_count++;
301 PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data);
302 if (outlen > subject_len) {
303 offset = ovector[1] + (outlen - subject_len);
304 } else {
305 offset = ovector[1] - (subject_len - outlen);
306 }
307 if (limit_count > 1) free(subject);
308 if (limit_count == limit || offset-1 == outlen) {
309 done = true;
310 results_count = limit_count;
311 } else {
312 subject = output;
313 subject_len = outlen;
314 output = (char *)malloc(buffer_size);
315 outlen = buffer_size;
316 }
317 } else {
318 if (limit_count > 1) {
319 free(subject);
320 }
321 done = true;
322 results_count = limit_count;
323 }
324 }
325 } else {
326 buffer_size = outlen;
327 output = (char *)realloc(output, buffer_size);
328 }
329 }
330
331 int return_count = 0;
332
333 if (results_count > 0) {
334 lua_pushlstring(L, (const char*) output, outlen);
335 lua_pushinteger(L, results_count);
336 return_count = 2;
337 } else if (results_count == 0) {
338 lua_pushlstring(L, subject, subject_len);
339 lua_pushinteger(L, 0);
340 return_count = 2;
341 }
342
343 free(output);
344 pcre2_match_data_free(match_data);
345 if (regex_compiled)
346 pcre2_code_free(re);
347
348 if (results_count < 0) {
349 PCRE2_UCHAR errmsg[256];
350 pcre2_get_error_message(results_count, errmsg, sizeof(errmsg));
351 return luaL_error(L, "regex substitute error: %s", errmsg);
352 }
353
354 return return_count;
355}
356
357static const luaL_Reg lib[] = {
358 { "compile", f_pcre_compile },
359 { "cmatch", f_pcre_match },
360 { "gmatch", f_pcre_gmatch },
361 { "gsub", f_pcre_gsub },
362 { "__gc", f_pcre_gc },
363 { NULL, NULL }
364};
365
366int luaopen_regex(lua_State *L) {
367 luaL_newlib(L, lib);
368 lua_pushliteral(L, "regex");
369 lua_setfield(L, -2, "__name");
370 lua_pushvalue(L, -1);
371 lua_setfield(L, LUA_REGISTRYINDEX, "regex");
372 lua_pushinteger(L, PCRE2_ANCHORED);
373 lua_setfield(L, -2, "ANCHORED");
374 lua_pushinteger(L, PCRE2_ANCHORED) ;
375 lua_setfield(L, -2, "ENDANCHORED");
376 lua_pushinteger(L, PCRE2_NOTBOL);
377 lua_setfield(L, -2, "NOTBOL");
378 lua_pushinteger(L, PCRE2_NOTEOL);
379 lua_setfield(L, -2, "NOTEOL");
380 lua_pushinteger(L, PCRE2_NOTEMPTY);
381 lua_setfield(L, -2, "NOTEMPTY");
382 lua_pushinteger(L, PCRE2_NOTEMPTY_ATSTART);
383 lua_setfield(L, -2, "NOTEMPTY_ATSTART");
384 return 1;
385}
386