1 | /**************************************************************************/ |
2 | /* regex.cpp */ |
3 | /**************************************************************************/ |
4 | /* This file is part of: */ |
5 | /* GODOT ENGINE */ |
6 | /* https://godotengine.org */ |
7 | /**************************************************************************/ |
8 | /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ |
9 | /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ |
10 | /* */ |
11 | /* Permission is hereby granted, free of charge, to any person obtaining */ |
12 | /* a copy of this software and associated documentation files (the */ |
13 | /* "Software"), to deal in the Software without restriction, including */ |
14 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
15 | /* distribute, sublicense, and/or sell copies of the Software, and to */ |
16 | /* permit persons to whom the Software is furnished to do so, subject to */ |
17 | /* the following conditions: */ |
18 | /* */ |
19 | /* The above copyright notice and this permission notice shall be */ |
20 | /* included in all copies or substantial portions of the Software. */ |
21 | /* */ |
22 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ |
23 | /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ |
24 | /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ |
25 | /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ |
26 | /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ |
27 | /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ |
28 | /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ |
29 | /**************************************************************************/ |
30 | |
31 | #include "regex.h" |
32 | |
33 | #include "core/os/memory.h" |
34 | |
35 | extern "C" { |
36 | #include <pcre2.h> |
37 | } |
38 | |
// Allocation callback with the signature PCRE2 expects for a general context.
// Forwards the requested size to memalloc(); the `user` pointer is ignored.
static void *_regex_malloc(PCRE2_SIZE size, void *user) {
	return memalloc(size);
}
42 | |
43 | static void _regex_free(void *ptr, void *user) { |
44 | if (ptr) { |
45 | memfree(ptr); |
46 | } |
47 | } |
48 | |
49 | int RegExMatch::_find(const Variant &p_name) const { |
50 | if (p_name.is_num()) { |
51 | int i = (int)p_name; |
52 | if (i >= data.size()) { |
53 | return -1; |
54 | } |
55 | return i; |
56 | } else if (p_name.get_type() == Variant::STRING || p_name.get_type() == Variant::STRING_NAME) { |
57 | HashMap<String, int>::ConstIterator found = names.find((String)p_name); |
58 | if (found) { |
59 | return found->value; |
60 | } |
61 | } |
62 | |
63 | return -1; |
64 | } |
65 | |
// Returns the subject string stored in this match object.
String RegExMatch::get_subject() const {
	return subject;
}
69 | |
70 | int RegExMatch::get_group_count() const { |
71 | if (data.size() == 0) { |
72 | return 0; |
73 | } |
74 | return data.size() - 1; |
75 | } |
76 | |
77 | Dictionary RegExMatch::get_names() const { |
78 | Dictionary result; |
79 | |
80 | for (const KeyValue<String, int> &E : names) { |
81 | result[E.key] = E.value; |
82 | } |
83 | |
84 | return result; |
85 | } |
86 | |
87 | PackedStringArray RegExMatch::get_strings() const { |
88 | PackedStringArray result; |
89 | |
90 | int size = data.size(); |
91 | |
92 | for (int i = 0; i < size; i++) { |
93 | int start = data[i].start; |
94 | |
95 | if (start == -1) { |
96 | result.append(String()); |
97 | continue; |
98 | } |
99 | |
100 | int length = data[i].end - start; |
101 | |
102 | result.append(subject.substr(start, length)); |
103 | } |
104 | |
105 | return result; |
106 | } |
107 | |
108 | String RegExMatch::get_string(const Variant &p_name) const { |
109 | int id = _find(p_name); |
110 | |
111 | if (id < 0) { |
112 | return String(); |
113 | } |
114 | |
115 | int start = data[id].start; |
116 | |
117 | if (start == -1) { |
118 | return String(); |
119 | } |
120 | |
121 | int length = data[id].end - start; |
122 | |
123 | return subject.substr(start, length); |
124 | } |
125 | |
126 | int RegExMatch::get_start(const Variant &p_name) const { |
127 | int id = _find(p_name); |
128 | |
129 | if (id < 0) { |
130 | return -1; |
131 | } |
132 | |
133 | return data[id].start; |
134 | } |
135 | |
136 | int RegExMatch::get_end(const Variant &p_name) const { |
137 | int id = _find(p_name); |
138 | |
139 | if (id < 0) { |
140 | return -1; |
141 | } |
142 | |
143 | return data[id].end; |
144 | } |
145 | |
// Registers RegExMatch's methods and properties with ClassDB.
void RegExMatch::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_subject"), &RegExMatch::get_subject);
	ClassDB::bind_method(D_METHOD("get_group_count"), &RegExMatch::get_group_count);
	ClassDB::bind_method(D_METHOD("get_names"), &RegExMatch::get_names);
	ClassDB::bind_method(D_METHOD("get_strings"), &RegExMatch::get_strings);
	// The "name" argument of the three accessors below defaults to 0.
	ClassDB::bind_method(D_METHOD("get_string", "name"), &RegExMatch::get_string, DEFVAL(0));
	ClassDB::bind_method(D_METHOD("get_start", "name"), &RegExMatch::get_start, DEFVAL(0));
	ClassDB::bind_method(D_METHOD("get_end", "name"), &RegExMatch::get_end, DEFVAL(0));

	// Each property names a getter only; the setter name is the empty string.
	ADD_PROPERTY(PropertyInfo(Variant::STRING, "subject"), "", "get_subject");
	ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "names"), "", "get_names");
	// NOTE(review): declared as Variant::ARRAY while get_strings() returns a
	// PackedStringArray -- confirm whether this mismatch is intentional.
	ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "strings"), "", "get_strings");
}
159 | |
160 | void RegEx::_pattern_info(uint32_t what, void *where) const { |
161 | pcre2_pattern_info_32((pcre2_code_32 *)code, what, where); |
162 | } |
163 | |
164 | Ref<RegEx> RegEx::create_from_string(const String &p_pattern) { |
165 | Ref<RegEx> ret; |
166 | ret.instantiate(); |
167 | ret->compile(p_pattern); |
168 | return ret; |
169 | } |
170 | |
171 | void RegEx::clear() { |
172 | if (code) { |
173 | pcre2_code_free_32((pcre2_code_32 *)code); |
174 | code = nullptr; |
175 | } |
176 | } |
177 | |
178 | Error RegEx::compile(const String &p_pattern) { |
179 | pattern = p_pattern; |
180 | clear(); |
181 | |
182 | int err; |
183 | PCRE2_SIZE offset; |
184 | uint32_t flags = PCRE2_DUPNAMES; |
185 | |
186 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
187 | pcre2_compile_context_32 *cctx = pcre2_compile_context_create_32(gctx); |
188 | PCRE2_SPTR32 p = (PCRE2_SPTR32)pattern.get_data(); |
189 | |
190 | code = pcre2_compile_32(p, pattern.length(), flags, &err, &offset, cctx); |
191 | |
192 | pcre2_compile_context_free_32(cctx); |
193 | |
194 | if (!code) { |
195 | PCRE2_UCHAR32 buf[256]; |
196 | pcre2_get_error_message_32(err, buf, 256); |
197 | String message = String::num(offset) + ": " + String((const char32_t *)buf); |
198 | ERR_PRINT(message.utf8()); |
199 | return FAILED; |
200 | } |
201 | return OK; |
202 | } |
203 | |
204 | Ref<RegExMatch> RegEx::search(const String &p_subject, int p_offset, int p_end) const { |
205 | ERR_FAIL_COND_V(!is_valid(), nullptr); |
206 | ERR_FAIL_COND_V_MSG(p_offset < 0, nullptr, "RegEx search offset must be >= 0" ); |
207 | |
208 | Ref<RegExMatch> result = memnew(RegExMatch); |
209 | |
210 | int length = p_subject.length(); |
211 | if (p_end >= 0 && p_end < length) { |
212 | length = p_end; |
213 | } |
214 | |
215 | pcre2_code_32 *c = (pcre2_code_32 *)code; |
216 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
217 | pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); |
218 | PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data(); |
219 | |
220 | pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); |
221 | |
222 | int res = pcre2_match_32(c, s, length, p_offset, 0, match, mctx); |
223 | |
224 | if (res < 0) { |
225 | pcre2_match_data_free_32(match); |
226 | pcre2_match_context_free_32(mctx); |
227 | |
228 | return nullptr; |
229 | } |
230 | |
231 | uint32_t size = pcre2_get_ovector_count_32(match); |
232 | PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(match); |
233 | |
234 | result->data.resize(size); |
235 | |
236 | for (uint32_t i = 0; i < size; i++) { |
237 | result->data.write[i].start = ovector[i * 2]; |
238 | result->data.write[i].end = ovector[i * 2 + 1]; |
239 | } |
240 | |
241 | pcre2_match_data_free_32(match); |
242 | pcre2_match_context_free_32(mctx); |
243 | |
244 | result->subject = p_subject; |
245 | |
246 | uint32_t count; |
247 | const char32_t *table; |
248 | uint32_t entry_size; |
249 | |
250 | _pattern_info(PCRE2_INFO_NAMECOUNT, &count); |
251 | _pattern_info(PCRE2_INFO_NAMETABLE, &table); |
252 | _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); |
253 | |
254 | for (uint32_t i = 0; i < count; i++) { |
255 | char32_t id = table[i * entry_size]; |
256 | if (result->data[id].start == -1) { |
257 | continue; |
258 | } |
259 | String name = &table[i * entry_size + 1]; |
260 | if (result->names.has(name)) { |
261 | continue; |
262 | } |
263 | |
264 | result->names.insert(name, id); |
265 | } |
266 | |
267 | return result; |
268 | } |
269 | |
270 | TypedArray<RegExMatch> RegEx::search_all(const String &p_subject, int p_offset, int p_end) const { |
271 | ERR_FAIL_COND_V_MSG(p_offset < 0, Array(), "RegEx search offset must be >= 0" ); |
272 | |
273 | int last_end = -1; |
274 | TypedArray<RegExMatch> result; |
275 | Ref<RegExMatch> match = search(p_subject, p_offset, p_end); |
276 | while (match.is_valid()) { |
277 | if (last_end == match->get_end(0)) { |
278 | break; |
279 | } |
280 | result.push_back(match); |
281 | last_end = match->get_end(0); |
282 | match = search(p_subject, match->get_end(0), p_end); |
283 | } |
284 | return result; |
285 | } |
286 | |
287 | String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const { |
288 | ERR_FAIL_COND_V(!is_valid(), String()); |
289 | ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0" ); |
290 | |
291 | // safety_zone is the number of chars we allocate in addition to the number of chars expected in order to |
292 | // guard against the PCRE API writing one additional \0 at the end. PCRE's API docs are unclear on whether |
293 | // PCRE understands outlength in pcre2_substitute() as counting an implicit additional terminating char or |
294 | // not. always allocating one char more than telling PCRE has us on the safe side. |
295 | const int safety_zone = 1; |
296 | |
297 | PCRE2_SIZE olength = p_subject.length() + 1; // space for output string and one terminating \0 character |
298 | Vector<char32_t> output; |
299 | output.resize(olength + safety_zone); |
300 | |
301 | uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH; |
302 | if (p_all) { |
303 | flags |= PCRE2_SUBSTITUTE_GLOBAL; |
304 | } |
305 | |
306 | PCRE2_SIZE length = p_subject.length(); |
307 | if (p_end >= 0 && (uint32_t)p_end < length) { |
308 | length = p_end; |
309 | } |
310 | |
311 | pcre2_code_32 *c = (pcre2_code_32 *)code; |
312 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
313 | pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); |
314 | PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data(); |
315 | PCRE2_SPTR32 r = (PCRE2_SPTR32)p_replacement.get_data(); |
316 | PCRE2_UCHAR32 *o = (PCRE2_UCHAR32 *)output.ptrw(); |
317 | |
318 | pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); |
319 | |
320 | int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); |
321 | |
322 | if (res == PCRE2_ERROR_NOMEMORY) { |
323 | output.resize(olength + safety_zone); |
324 | o = (PCRE2_UCHAR32 *)output.ptrw(); |
325 | res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); |
326 | } |
327 | |
328 | pcre2_match_data_free_32(match); |
329 | pcre2_match_context_free_32(mctx); |
330 | |
331 | if (res < 0) { |
332 | return String(); |
333 | } |
334 | |
335 | return String(output.ptr(), olength); |
336 | } |
337 | |
338 | bool RegEx::is_valid() const { |
339 | return (code != nullptr); |
340 | } |
341 | |
// Returns the stored pattern string. compile() stores the pattern before
// compiling, so this is the last pattern passed to it, whether or not it
// compiled successfully.
String RegEx::get_pattern() const {
	return pattern;
}
345 | |
346 | int RegEx::get_group_count() const { |
347 | ERR_FAIL_COND_V(!is_valid(), 0); |
348 | |
349 | uint32_t count; |
350 | |
351 | _pattern_info(PCRE2_INFO_CAPTURECOUNT, &count); |
352 | |
353 | return count; |
354 | } |
355 | |
356 | PackedStringArray RegEx::get_names() const { |
357 | PackedStringArray result; |
358 | |
359 | ERR_FAIL_COND_V(!is_valid(), result); |
360 | |
361 | uint32_t count; |
362 | const char32_t *table; |
363 | uint32_t entry_size; |
364 | |
365 | _pattern_info(PCRE2_INFO_NAMECOUNT, &count); |
366 | _pattern_info(PCRE2_INFO_NAMETABLE, &table); |
367 | _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); |
368 | |
369 | for (uint32_t i = 0; i < count; i++) { |
370 | String name = &table[i * entry_size + 1]; |
371 | if (result.find(name) < 0) { |
372 | result.append(name); |
373 | } |
374 | } |
375 | |
376 | return result; |
377 | } |
378 | |
379 | RegEx::RegEx() { |
380 | general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr); |
381 | } |
382 | |
383 | RegEx::RegEx(const String &p_pattern) { |
384 | general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr); |
385 | compile(p_pattern); |
386 | } |
387 | |
388 | RegEx::~RegEx() { |
389 | if (code) { |
390 | pcre2_code_free_32((pcre2_code_32 *)code); |
391 | } |
392 | pcre2_general_context_free_32((pcre2_general_context_32 *)general_ctx); |
393 | } |
394 | |
// Registers RegEx's static and instance methods with ClassDB.
void RegEx::_bind_methods() {
	ClassDB::bind_static_method("RegEx", D_METHOD("create_from_string", "pattern"), &RegEx::create_from_string);

	ClassDB::bind_method(D_METHOD("clear"), &RegEx::clear);
	ClassDB::bind_method(D_METHOD("compile", "pattern"), &RegEx::compile);
	// Defaults for search / search_all: offset = 0, end = -1.
	ClassDB::bind_method(D_METHOD("search", "subject", "offset", "end"), &RegEx::search, DEFVAL(0), DEFVAL(-1));
	ClassDB::bind_method(D_METHOD("search_all", "subject", "offset", "end"), &RegEx::search_all, DEFVAL(0), DEFVAL(-1));
	// Defaults for sub: all = false, offset = 0, end = -1.
	ClassDB::bind_method(D_METHOD("sub", "subject", "replacement", "all", "offset", "end"), &RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1));
	ClassDB::bind_method(D_METHOD("is_valid"), &RegEx::is_valid);
	ClassDB::bind_method(D_METHOD("get_pattern"), &RegEx::get_pattern);
	ClassDB::bind_method(D_METHOD("get_group_count"), &RegEx::get_group_count);
	ClassDB::bind_method(D_METHOD("get_names"), &RegEx::get_names);
}
408 | |