1/**************************************************************************/
2/* regex.cpp */
3/**************************************************************************/
4/* This file is part of: */
5/* GODOT ENGINE */
6/* https://godotengine.org */
7/**************************************************************************/
8/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
10/* */
11/* Permission is hereby granted, free of charge, to any person obtaining */
12/* a copy of this software and associated documentation files (the */
13/* "Software"), to deal in the Software without restriction, including */
14/* without limitation the rights to use, copy, modify, merge, publish, */
15/* distribute, sublicense, and/or sell copies of the Software, and to */
16/* permit persons to whom the Software is furnished to do so, subject to */
17/* the following conditions: */
18/* */
19/* The above copyright notice and this permission notice shall be */
20/* included in all copies or substantial portions of the Software. */
21/* */
22/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
23/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
24/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
26/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
27/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
28/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
29/**************************************************************************/
30
31#include "regex.h"
32
33#include "core/os/memory.h"
34
35extern "C" {
36#include <pcre2.h>
37}
38
39static void *_regex_malloc(PCRE2_SIZE size, void *user) {
40 return memalloc(size);
41}
42
43static void _regex_free(void *ptr, void *user) {
44 if (ptr) {
45 memfree(ptr);
46 }
47}
48
49int RegExMatch::_find(const Variant &p_name) const {
50 if (p_name.is_num()) {
51 int i = (int)p_name;
52 if (i >= data.size()) {
53 return -1;
54 }
55 return i;
56 } else if (p_name.get_type() == Variant::STRING || p_name.get_type() == Variant::STRING_NAME) {
57 HashMap<String, int>::ConstIterator found = names.find((String)p_name);
58 if (found) {
59 return found->value;
60 }
61 }
62
63 return -1;
64}
65
66String RegExMatch::get_subject() const {
67 return subject;
68}
69
70int RegExMatch::get_group_count() const {
71 if (data.size() == 0) {
72 return 0;
73 }
74 return data.size() - 1;
75}
76
77Dictionary RegExMatch::get_names() const {
78 Dictionary result;
79
80 for (const KeyValue<String, int> &E : names) {
81 result[E.key] = E.value;
82 }
83
84 return result;
85}
86
87PackedStringArray RegExMatch::get_strings() const {
88 PackedStringArray result;
89
90 int size = data.size();
91
92 for (int i = 0; i < size; i++) {
93 int start = data[i].start;
94
95 if (start == -1) {
96 result.append(String());
97 continue;
98 }
99
100 int length = data[i].end - start;
101
102 result.append(subject.substr(start, length));
103 }
104
105 return result;
106}
107
108String RegExMatch::get_string(const Variant &p_name) const {
109 int id = _find(p_name);
110
111 if (id < 0) {
112 return String();
113 }
114
115 int start = data[id].start;
116
117 if (start == -1) {
118 return String();
119 }
120
121 int length = data[id].end - start;
122
123 return subject.substr(start, length);
124}
125
126int RegExMatch::get_start(const Variant &p_name) const {
127 int id = _find(p_name);
128
129 if (id < 0) {
130 return -1;
131 }
132
133 return data[id].start;
134}
135
136int RegExMatch::get_end(const Variant &p_name) const {
137 int id = _find(p_name);
138
139 if (id < 0) {
140 return -1;
141 }
142
143 return data[id].end;
144}
145
146void RegExMatch::_bind_methods() {
147 ClassDB::bind_method(D_METHOD("get_subject"), &RegExMatch::get_subject);
148 ClassDB::bind_method(D_METHOD("get_group_count"), &RegExMatch::get_group_count);
149 ClassDB::bind_method(D_METHOD("get_names"), &RegExMatch::get_names);
150 ClassDB::bind_method(D_METHOD("get_strings"), &RegExMatch::get_strings);
151 ClassDB::bind_method(D_METHOD("get_string", "name"), &RegExMatch::get_string, DEFVAL(0));
152 ClassDB::bind_method(D_METHOD("get_start", "name"), &RegExMatch::get_start, DEFVAL(0));
153 ClassDB::bind_method(D_METHOD("get_end", "name"), &RegExMatch::get_end, DEFVAL(0));
154
155 ADD_PROPERTY(PropertyInfo(Variant::STRING, "subject"), "", "get_subject");
156 ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "names"), "", "get_names");
157 ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "strings"), "", "get_strings");
158}
159
160void RegEx::_pattern_info(uint32_t what, void *where) const {
161 pcre2_pattern_info_32((pcre2_code_32 *)code, what, where);
162}
163
164Ref<RegEx> RegEx::create_from_string(const String &p_pattern) {
165 Ref<RegEx> ret;
166 ret.instantiate();
167 ret->compile(p_pattern);
168 return ret;
169}
170
171void RegEx::clear() {
172 if (code) {
173 pcre2_code_free_32((pcre2_code_32 *)code);
174 code = nullptr;
175 }
176}
177
178Error RegEx::compile(const String &p_pattern) {
179 pattern = p_pattern;
180 clear();
181
182 int err;
183 PCRE2_SIZE offset;
184 uint32_t flags = PCRE2_DUPNAMES;
185
186 pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx;
187 pcre2_compile_context_32 *cctx = pcre2_compile_context_create_32(gctx);
188 PCRE2_SPTR32 p = (PCRE2_SPTR32)pattern.get_data();
189
190 code = pcre2_compile_32(p, pattern.length(), flags, &err, &offset, cctx);
191
192 pcre2_compile_context_free_32(cctx);
193
194 if (!code) {
195 PCRE2_UCHAR32 buf[256];
196 pcre2_get_error_message_32(err, buf, 256);
197 String message = String::num(offset) + ": " + String((const char32_t *)buf);
198 ERR_PRINT(message.utf8());
199 return FAILED;
200 }
201 return OK;
202}
203
204Ref<RegExMatch> RegEx::search(const String &p_subject, int p_offset, int p_end) const {
205 ERR_FAIL_COND_V(!is_valid(), nullptr);
206 ERR_FAIL_COND_V_MSG(p_offset < 0, nullptr, "RegEx search offset must be >= 0");
207
208 Ref<RegExMatch> result = memnew(RegExMatch);
209
210 int length = p_subject.length();
211 if (p_end >= 0 && p_end < length) {
212 length = p_end;
213 }
214
215 pcre2_code_32 *c = (pcre2_code_32 *)code;
216 pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx;
217 pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx);
218 PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data();
219
220 pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx);
221
222 int res = pcre2_match_32(c, s, length, p_offset, 0, match, mctx);
223
224 if (res < 0) {
225 pcre2_match_data_free_32(match);
226 pcre2_match_context_free_32(mctx);
227
228 return nullptr;
229 }
230
231 uint32_t size = pcre2_get_ovector_count_32(match);
232 PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(match);
233
234 result->data.resize(size);
235
236 for (uint32_t i = 0; i < size; i++) {
237 result->data.write[i].start = ovector[i * 2];
238 result->data.write[i].end = ovector[i * 2 + 1];
239 }
240
241 pcre2_match_data_free_32(match);
242 pcre2_match_context_free_32(mctx);
243
244 result->subject = p_subject;
245
246 uint32_t count;
247 const char32_t *table;
248 uint32_t entry_size;
249
250 _pattern_info(PCRE2_INFO_NAMECOUNT, &count);
251 _pattern_info(PCRE2_INFO_NAMETABLE, &table);
252 _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size);
253
254 for (uint32_t i = 0; i < count; i++) {
255 char32_t id = table[i * entry_size];
256 if (result->data[id].start == -1) {
257 continue;
258 }
259 String name = &table[i * entry_size + 1];
260 if (result->names.has(name)) {
261 continue;
262 }
263
264 result->names.insert(name, id);
265 }
266
267 return result;
268}
269
270TypedArray<RegExMatch> RegEx::search_all(const String &p_subject, int p_offset, int p_end) const {
271 ERR_FAIL_COND_V_MSG(p_offset < 0, Array(), "RegEx search offset must be >= 0");
272
273 int last_end = -1;
274 TypedArray<RegExMatch> result;
275 Ref<RegExMatch> match = search(p_subject, p_offset, p_end);
276 while (match.is_valid()) {
277 if (last_end == match->get_end(0)) {
278 break;
279 }
280 result.push_back(match);
281 last_end = match->get_end(0);
282 match = search(p_subject, match->get_end(0), p_end);
283 }
284 return result;
285}
286
287String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const {
288 ERR_FAIL_COND_V(!is_valid(), String());
289 ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0");
290
291 // safety_zone is the number of chars we allocate in addition to the number of chars expected in order to
292 // guard against the PCRE API writing one additional \0 at the end. PCRE's API docs are unclear on whether
293 // PCRE understands outlength in pcre2_substitute() as counting an implicit additional terminating char or
294 // not. always allocating one char more than telling PCRE has us on the safe side.
295 const int safety_zone = 1;
296
297 PCRE2_SIZE olength = p_subject.length() + 1; // space for output string and one terminating \0 character
298 Vector<char32_t> output;
299 output.resize(olength + safety_zone);
300
301 uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
302 if (p_all) {
303 flags |= PCRE2_SUBSTITUTE_GLOBAL;
304 }
305
306 PCRE2_SIZE length = p_subject.length();
307 if (p_end >= 0 && (uint32_t)p_end < length) {
308 length = p_end;
309 }
310
311 pcre2_code_32 *c = (pcre2_code_32 *)code;
312 pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx;
313 pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx);
314 PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data();
315 PCRE2_SPTR32 r = (PCRE2_SPTR32)p_replacement.get_data();
316 PCRE2_UCHAR32 *o = (PCRE2_UCHAR32 *)output.ptrw();
317
318 pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx);
319
320 int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
321
322 if (res == PCRE2_ERROR_NOMEMORY) {
323 output.resize(olength + safety_zone);
324 o = (PCRE2_UCHAR32 *)output.ptrw();
325 res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
326 }
327
328 pcre2_match_data_free_32(match);
329 pcre2_match_context_free_32(mctx);
330
331 if (res < 0) {
332 return String();
333 }
334
335 return String(output.ptr(), olength);
336}
337
338bool RegEx::is_valid() const {
339 return (code != nullptr);
340}
341
342String RegEx::get_pattern() const {
343 return pattern;
344}
345
346int RegEx::get_group_count() const {
347 ERR_FAIL_COND_V(!is_valid(), 0);
348
349 uint32_t count;
350
351 _pattern_info(PCRE2_INFO_CAPTURECOUNT, &count);
352
353 return count;
354}
355
356PackedStringArray RegEx::get_names() const {
357 PackedStringArray result;
358
359 ERR_FAIL_COND_V(!is_valid(), result);
360
361 uint32_t count;
362 const char32_t *table;
363 uint32_t entry_size;
364
365 _pattern_info(PCRE2_INFO_NAMECOUNT, &count);
366 _pattern_info(PCRE2_INFO_NAMETABLE, &table);
367 _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size);
368
369 for (uint32_t i = 0; i < count; i++) {
370 String name = &table[i * entry_size + 1];
371 if (result.find(name) < 0) {
372 result.append(name);
373 }
374 }
375
376 return result;
377}
378
379RegEx::RegEx() {
380 general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr);
381}
382
383RegEx::RegEx(const String &p_pattern) {
384 general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr);
385 compile(p_pattern);
386}
387
388RegEx::~RegEx() {
389 if (code) {
390 pcre2_code_free_32((pcre2_code_32 *)code);
391 }
392 pcre2_general_context_free_32((pcre2_general_context_32 *)general_ctx);
393}
394
395void RegEx::_bind_methods() {
396 ClassDB::bind_static_method("RegEx", D_METHOD("create_from_string", "pattern"), &RegEx::create_from_string);
397
398 ClassDB::bind_method(D_METHOD("clear"), &RegEx::clear);
399 ClassDB::bind_method(D_METHOD("compile", "pattern"), &RegEx::compile);
400 ClassDB::bind_method(D_METHOD("search", "subject", "offset", "end"), &RegEx::search, DEFVAL(0), DEFVAL(-1));
401 ClassDB::bind_method(D_METHOD("search_all", "subject", "offset", "end"), &RegEx::search_all, DEFVAL(0), DEFVAL(-1));
402 ClassDB::bind_method(D_METHOD("sub", "subject", "replacement", "all", "offset", "end"), &RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1));
403 ClassDB::bind_method(D_METHOD("is_valid"), &RegEx::is_valid);
404 ClassDB::bind_method(D_METHOD("get_pattern"), &RegEx::get_pattern);
405 ClassDB::bind_method(D_METHOD("get_group_count"), &RegEx::get_group_count);
406 ClassDB::bind_method(D_METHOD("get_names"), &RegEx::get_names);
407}
408