1 | #include "api.h" |
2 | |
3 | #define PCRE2_CODE_UNIT_WIDTH 8 |
4 | |
5 | #include <string.h> |
6 | #include <pcre2.h> |
7 | #include <stdbool.h> |
8 | |
9 | typedef struct RegexState { |
10 | pcre2_code* re; |
11 | pcre2_match_data* match_data; |
12 | const char* subject; |
13 | size_t subject_len; |
14 | size_t offset; |
15 | bool regex_compiled; |
16 | bool found; |
17 | } RegexState; |
18 | |
19 | static pcre2_code* regex_get_pattern(lua_State *L, bool* should_free) { |
20 | pcre2_code* re = NULL; |
21 | *should_free = false; |
22 | |
23 | if (lua_type(L, 1) == LUA_TTABLE) { |
24 | lua_rawgeti(L, 1, 1); |
25 | re = (pcre2_code*)lua_touserdata(L, -1); |
26 | lua_settop(L, -2); |
27 | } else { |
28 | int errornumber; |
29 | PCRE2_SIZE erroroffset; |
30 | size_t pattern_len = 0; |
31 | const char* pattern = luaL_checklstring(L, 1, &pattern_len); |
32 | |
33 | re = pcre2_compile( |
34 | (PCRE2_SPTR)pattern, |
35 | pattern_len, PCRE2_UTF, |
36 | &errornumber, &erroroffset, NULL |
37 | ); |
38 | |
39 | if (re == NULL) { |
40 | PCRE2_UCHAR errmsg[256]; |
41 | pcre2_get_error_message(errornumber, errmsg, sizeof(errmsg)); |
42 | luaL_error( |
43 | L, "regex pattern error at offset %d: %s" , |
44 | (int)erroroffset, errmsg |
45 | ); |
46 | return NULL; |
47 | } |
48 | |
49 | pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); |
50 | |
51 | *should_free = true; |
52 | } |
53 | |
54 | return re; |
55 | } |
56 | |
57 | static int regex_gmatch_iterator(lua_State *L) { |
58 | RegexState *state = (RegexState*)lua_touserdata(L, lua_upvalueindex(3)); |
59 | |
60 | if (state->found) { |
61 | int rc = pcre2_match( |
62 | state->re, |
63 | (PCRE2_SPTR)state->subject, state->subject_len, |
64 | state->offset, 0, state->match_data, NULL |
65 | ); |
66 | |
67 | if (rc < 0) { |
68 | if (rc != PCRE2_ERROR_NOMATCH) { |
69 | PCRE2_UCHAR buffer[120]; |
70 | pcre2_get_error_message(rc, buffer, sizeof(buffer)); |
71 | luaL_error(L, "regex matching error %d: %s" , rc, buffer); |
72 | } |
73 | goto clean; |
74 | } else { |
75 | size_t ovector_count = pcre2_get_ovector_count(state->match_data); |
76 | if (ovector_count > 0) { |
77 | PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(state->match_data); |
78 | if (ovector[0] > ovector[1]) { |
79 | /* We must guard against patterns such as /(?=.\K)/ that use \K in an |
80 | assertion to set the start of a match later than its end. In the editor, |
81 | we just detect this case and give up. */ |
82 | luaL_error(L, "regex matching error: \\K was used in an assertion to " |
83 | " set the match start after its end" ); |
84 | goto clean; |
85 | } |
86 | |
87 | int index = 0; |
88 | if (ovector_count > 1) index = 2; |
89 | |
90 | int total = 0; |
91 | int total_results = ovector_count * 2; |
92 | size_t last_offset = 0; |
93 | for (int i = index; i < total_results; i+=2) { |
94 | if (ovector[i] == ovector[i+1]) |
95 | lua_pushinteger(L, ovector[i] + 1); |
96 | else |
97 | lua_pushlstring(L, state->subject+ovector[i], ovector[i+1] - ovector[i]); |
98 | last_offset = ovector[i+1]; |
99 | total++; |
100 | } |
101 | |
102 | if (last_offset - 1 < state->subject_len) |
103 | state->offset = last_offset; |
104 | else |
105 | state->found = false; |
106 | |
107 | return total; |
108 | } else { |
109 | state->found = false; |
110 | } |
111 | } |
112 | } |
113 | |
114 | clean: |
115 | if (state->regex_compiled) pcre2_code_free(state->re); |
116 | pcre2_match_data_free(state->match_data); |
117 | |
118 | return 0; /* not found */ |
119 | } |
120 | |
121 | static size_t regex_offset_relative(lua_Integer pos, size_t len) { |
122 | if (pos > 0) |
123 | return (size_t)pos; |
124 | else if (pos == 0) |
125 | return 1; |
126 | else if (pos < -(lua_Integer)len) /* inverted comparison */ |
127 | return 1; /* clip to 1 */ |
128 | else return len + (size_t)pos + 1; |
129 | } |
130 | |
131 | static int f_pcre_gc(lua_State* L) { |
132 | lua_rawgeti(L, -1, 1); |
133 | pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1); |
134 | if (re) |
135 | pcre2_code_free(re); |
136 | return 0; |
137 | } |
138 | |
139 | static int f_pcre_compile(lua_State *L) { |
140 | size_t len; |
141 | PCRE2_SIZE errorOffset; |
142 | int errorNumber; |
143 | int pattern = PCRE2_UTF; |
144 | const char* str = luaL_checklstring(L, 1, &len); |
145 | if (lua_gettop(L) > 1) { |
146 | const char* options = luaL_checkstring(L, 2); |
147 | if (strstr(options,"i" )) |
148 | pattern |= PCRE2_CASELESS; |
149 | if (strstr(options,"m" )) |
150 | pattern |= PCRE2_MULTILINE; |
151 | if (strstr(options,"s" )) |
152 | pattern |= PCRE2_DOTALL; |
153 | } |
154 | pcre2_code* re = pcre2_compile( |
155 | (PCRE2_SPTR)str, |
156 | len, |
157 | pattern, |
158 | &errorNumber, |
159 | &errorOffset, |
160 | NULL |
161 | ); |
162 | if (re) { |
163 | pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); |
164 | lua_newtable(L); |
165 | lua_pushlightuserdata(L, re); |
166 | lua_rawseti(L, -2, 1); |
167 | luaL_setmetatable(L, "regex" ); |
168 | return 1; |
169 | } |
170 | PCRE2_UCHAR buffer[256]; |
171 | pcre2_get_error_message(errorNumber, buffer, sizeof(buffer)); |
172 | lua_pushnil(L); |
173 | char message[1024]; |
174 | len = snprintf(message, sizeof(message), "regex compilation failed at offset %d: %s" , (int)errorOffset, buffer); |
175 | lua_pushlstring(L, message, len); |
176 | return 2; |
177 | } |
178 | |
179 | // Takes string, compiled regex, returns list of indices of matched groups |
180 | // (including the whole match), if a match was found. |
181 | static int f_pcre_match(lua_State *L) { |
182 | size_t len, offset = 1, opts = 0; |
183 | bool regex_compiled = false; |
184 | pcre2_code* re = regex_get_pattern(L, ®ex_compiled); |
185 | if (!re) return 0 ; |
186 | const char* str = luaL_checklstring(L, 2, &len); |
187 | if (lua_gettop(L) > 2) |
188 | offset = regex_offset_relative(luaL_checknumber(L, 3), len); |
189 | offset -= 1; |
190 | len -= offset; |
191 | if (lua_gettop(L) > 3) |
192 | opts = luaL_checknumber(L, 4); |
193 | lua_rawgeti(L, 1, 1); |
194 | pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL); |
195 | int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL); |
196 | if (rc < 0) { |
197 | if (regex_compiled) pcre2_code_free(re); |
198 | pcre2_match_data_free(md); |
199 | if (rc != PCRE2_ERROR_NOMATCH) { |
200 | PCRE2_UCHAR buffer[120]; |
201 | pcre2_get_error_message(rc, buffer, sizeof(buffer)); |
202 | luaL_error(L, "regex matching error %d: %s" , rc, buffer); |
203 | } |
204 | return 0; |
205 | } |
206 | PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md); |
207 | if (ovector[0] > ovector[1]) { |
208 | /* We must guard against patterns such as /(?=.\K)/ that use \K in an |
209 | assertion to set the start of a match later than its end. In the editor, |
210 | we just detect this case and give up. */ |
211 | luaL_error(L, "regex matching error: \\K was used in an assertion to " |
212 | " set the match start after its end" ); |
213 | if (regex_compiled) pcre2_code_free(re); |
214 | pcre2_match_data_free(md); |
215 | return 0; |
216 | } |
217 | for (int i = 0; i < rc*2; i++) |
218 | lua_pushinteger(L, ovector[i]+offset+1); |
219 | if (regex_compiled) pcre2_code_free(re); |
220 | pcre2_match_data_free(md); |
221 | return rc*2; |
222 | } |
223 | |
224 | static int f_pcre_gmatch(lua_State *L) { |
225 | /* pattern param */ |
226 | bool regex_compiled = false; |
227 | pcre2_code* re = regex_get_pattern(L, ®ex_compiled); |
228 | if (!re) return 0; |
229 | size_t subject_len = 0; |
230 | |
231 | /* subject param */ |
232 | const char* subject = luaL_checklstring(L, 2, &subject_len); |
233 | |
234 | /* offset param */ |
235 | size_t offset = regex_offset_relative( |
236 | luaL_optnumber(L, 3, 1), subject_len |
237 | ) - 1; |
238 | |
239 | /* keep strings on closure to avoid being collected */ |
240 | lua_settop(L, 2); |
241 | |
242 | RegexState *state; |
243 | state = (RegexState*)lua_newuserdata(L, sizeof(RegexState)); |
244 | |
245 | state->re = re; |
246 | state->match_data = pcre2_match_data_create_from_pattern(re, NULL); |
247 | state->subject = subject; |
248 | state->subject_len = subject_len; |
249 | state->offset = offset; |
250 | state->found = true; |
251 | state->regex_compiled = regex_compiled; |
252 | |
253 | lua_pushcclosure(L, regex_gmatch_iterator, 3); |
254 | return 1; |
255 | } |
256 | |
257 | static int f_pcre_gsub(lua_State *L) { |
258 | size_t subject_len = 0, replacement_len = 0; |
259 | |
260 | bool regex_compiled = false; |
261 | pcre2_code* re = regex_get_pattern(L, ®ex_compiled); |
262 | if (!re) return 0 ; |
263 | |
264 | char* subject = (char*) luaL_checklstring(L, 2, &subject_len); |
265 | const char* replacement = luaL_checklstring(L, 3, &replacement_len); |
266 | int limit = luaL_optinteger(L, 4, 0); |
267 | if (limit < 0 ) limit = 0; |
268 | |
269 | pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL); |
270 | |
271 | size_t buffer_size = 1024; |
272 | char *output = (char *)malloc(buffer_size); |
273 | |
274 | int options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED; |
275 | if (limit == 0) options |= PCRE2_SUBSTITUTE_GLOBAL; |
276 | |
277 | int results_count = 0; |
278 | int limit_count = 0; |
279 | bool done = false; |
280 | size_t offset = 0; |
281 | PCRE2_SIZE outlen = buffer_size; |
282 | while (!done) { |
283 | results_count = pcre2_substitute( |
284 | re, |
285 | (PCRE2_SPTR)subject, subject_len, |
286 | offset, options, |
287 | match_data, NULL, |
288 | (PCRE2_SPTR)replacement, replacement_len, |
289 | (PCRE2_UCHAR*)output, &outlen |
290 | ); |
291 | |
292 | if (results_count != PCRE2_ERROR_NOMEMORY || buffer_size >= outlen) { |
293 | /* PCRE2_SUBSTITUTE_GLOBAL code path (fastest) */ |
294 | if(limit == 0) { |
295 | done = true; |
296 | /* non PCRE2_SUBSTITUTE_GLOBAL with limit code path (slower) */ |
297 | } else { |
298 | size_t ovector_count = pcre2_get_ovector_count(match_data); |
299 | if (results_count > 0 && ovector_count > 0) { |
300 | limit_count++; |
301 | PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data); |
302 | if (outlen > subject_len) { |
303 | offset = ovector[1] + (outlen - subject_len); |
304 | } else { |
305 | offset = ovector[1] - (subject_len - outlen); |
306 | } |
307 | if (limit_count > 1) free(subject); |
308 | if (limit_count == limit || offset-1 == outlen) { |
309 | done = true; |
310 | results_count = limit_count; |
311 | } else { |
312 | subject = output; |
313 | subject_len = outlen; |
314 | output = (char *)malloc(buffer_size); |
315 | outlen = buffer_size; |
316 | } |
317 | } else { |
318 | if (limit_count > 1) { |
319 | free(subject); |
320 | } |
321 | done = true; |
322 | results_count = limit_count; |
323 | } |
324 | } |
325 | } else { |
326 | buffer_size = outlen; |
327 | output = (char *)realloc(output, buffer_size); |
328 | } |
329 | } |
330 | |
331 | int return_count = 0; |
332 | |
333 | if (results_count > 0) { |
334 | lua_pushlstring(L, (const char*) output, outlen); |
335 | lua_pushinteger(L, results_count); |
336 | return_count = 2; |
337 | } else if (results_count == 0) { |
338 | lua_pushlstring(L, subject, subject_len); |
339 | lua_pushinteger(L, 0); |
340 | return_count = 2; |
341 | } |
342 | |
343 | free(output); |
344 | pcre2_match_data_free(match_data); |
345 | if (regex_compiled) |
346 | pcre2_code_free(re); |
347 | |
348 | if (results_count < 0) { |
349 | PCRE2_UCHAR errmsg[256]; |
350 | pcre2_get_error_message(results_count, errmsg, sizeof(errmsg)); |
351 | return luaL_error(L, "regex substitute error: %s" , errmsg); |
352 | } |
353 | |
354 | return return_count; |
355 | } |
356 | |
357 | static const luaL_Reg lib[] = { |
358 | { "compile" , f_pcre_compile }, |
359 | { "cmatch" , f_pcre_match }, |
360 | { "gmatch" , f_pcre_gmatch }, |
361 | { "gsub" , f_pcre_gsub }, |
362 | { "__gc" , f_pcre_gc }, |
363 | { NULL, NULL } |
364 | }; |
365 | |
366 | int luaopen_regex(lua_State *L) { |
367 | luaL_newlib(L, lib); |
368 | lua_pushliteral(L, "regex" ); |
369 | lua_setfield(L, -2, "__name" ); |
370 | lua_pushvalue(L, -1); |
371 | lua_setfield(L, LUA_REGISTRYINDEX, "regex" ); |
372 | lua_pushinteger(L, PCRE2_ANCHORED); |
373 | lua_setfield(L, -2, "ANCHORED" ); |
374 | lua_pushinteger(L, PCRE2_ANCHORED) ; |
375 | lua_setfield(L, -2, "ENDANCHORED" ); |
376 | lua_pushinteger(L, PCRE2_NOTBOL); |
377 | lua_setfield(L, -2, "NOTBOL" ); |
378 | lua_pushinteger(L, PCRE2_NOTEOL); |
379 | lua_setfield(L, -2, "NOTEOL" ); |
380 | lua_pushinteger(L, PCRE2_NOTEMPTY); |
381 | lua_setfield(L, -2, "NOTEMPTY" ); |
382 | lua_pushinteger(L, PCRE2_NOTEMPTY_ATSTART); |
383 | lua_setfield(L, -2, "NOTEMPTY_ATSTART" ); |
384 | return 1; |
385 | } |
386 | |