| 1 | /**************************************************************************/ |
| 2 | /* regex.cpp */ |
| 3 | /**************************************************************************/ |
| 4 | /* This file is part of: */ |
| 5 | /* GODOT ENGINE */ |
| 6 | /* https://godotengine.org */ |
| 7 | /**************************************************************************/ |
| 8 | /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ |
| 9 | /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ |
| 10 | /* */ |
| 11 | /* Permission is hereby granted, free of charge, to any person obtaining */ |
| 12 | /* a copy of this software and associated documentation files (the */ |
| 13 | /* "Software"), to deal in the Software without restriction, including */ |
| 14 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
| 15 | /* distribute, sublicense, and/or sell copies of the Software, and to */ |
| 16 | /* permit persons to whom the Software is furnished to do so, subject to */ |
| 17 | /* the following conditions: */ |
| 18 | /* */ |
| 19 | /* The above copyright notice and this permission notice shall be */ |
| 20 | /* included in all copies or substantial portions of the Software. */ |
| 21 | /* */ |
| 22 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ |
| 23 | /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ |
| 24 | /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ |
| 25 | /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ |
| 26 | /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ |
| 27 | /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ |
| 28 | /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ |
| 29 | /**************************************************************************/ |
| 30 | |
| 31 | #include "regex.h" |
| 32 | |
| 33 | #include "core/os/memory.h" |
| 34 | |
| 35 | extern "C" { |
| 36 | #include <pcre2.h> |
| 37 | } |
| 38 | |
| 39 | static void *_regex_malloc(PCRE2_SIZE size, void *user) { |
| 40 | return memalloc(size); |
| 41 | } |
| 42 | |
| 43 | static void _regex_free(void *ptr, void *user) { |
| 44 | if (ptr) { |
| 45 | memfree(ptr); |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | int RegExMatch::_find(const Variant &p_name) const { |
| 50 | if (p_name.is_num()) { |
| 51 | int i = (int)p_name; |
| 52 | if (i >= data.size()) { |
| 53 | return -1; |
| 54 | } |
| 55 | return i; |
| 56 | } else if (p_name.get_type() == Variant::STRING || p_name.get_type() == Variant::STRING_NAME) { |
| 57 | HashMap<String, int>::ConstIterator found = names.find((String)p_name); |
| 58 | if (found) { |
| 59 | return found->value; |
| 60 | } |
| 61 | } |
| 62 | |
| 63 | return -1; |
| 64 | } |
| 65 | |
| 66 | String RegExMatch::get_subject() const { |
| 67 | return subject; |
| 68 | } |
| 69 | |
| 70 | int RegExMatch::get_group_count() const { |
| 71 | if (data.size() == 0) { |
| 72 | return 0; |
| 73 | } |
| 74 | return data.size() - 1; |
| 75 | } |
| 76 | |
| 77 | Dictionary RegExMatch::get_names() const { |
| 78 | Dictionary result; |
| 79 | |
| 80 | for (const KeyValue<String, int> &E : names) { |
| 81 | result[E.key] = E.value; |
| 82 | } |
| 83 | |
| 84 | return result; |
| 85 | } |
| 86 | |
| 87 | PackedStringArray RegExMatch::get_strings() const { |
| 88 | PackedStringArray result; |
| 89 | |
| 90 | int size = data.size(); |
| 91 | |
| 92 | for (int i = 0; i < size; i++) { |
| 93 | int start = data[i].start; |
| 94 | |
| 95 | if (start == -1) { |
| 96 | result.append(String()); |
| 97 | continue; |
| 98 | } |
| 99 | |
| 100 | int length = data[i].end - start; |
| 101 | |
| 102 | result.append(subject.substr(start, length)); |
| 103 | } |
| 104 | |
| 105 | return result; |
| 106 | } |
| 107 | |
| 108 | String RegExMatch::get_string(const Variant &p_name) const { |
| 109 | int id = _find(p_name); |
| 110 | |
| 111 | if (id < 0) { |
| 112 | return String(); |
| 113 | } |
| 114 | |
| 115 | int start = data[id].start; |
| 116 | |
| 117 | if (start == -1) { |
| 118 | return String(); |
| 119 | } |
| 120 | |
| 121 | int length = data[id].end - start; |
| 122 | |
| 123 | return subject.substr(start, length); |
| 124 | } |
| 125 | |
| 126 | int RegExMatch::get_start(const Variant &p_name) const { |
| 127 | int id = _find(p_name); |
| 128 | |
| 129 | if (id < 0) { |
| 130 | return -1; |
| 131 | } |
| 132 | |
| 133 | return data[id].start; |
| 134 | } |
| 135 | |
| 136 | int RegExMatch::get_end(const Variant &p_name) const { |
| 137 | int id = _find(p_name); |
| 138 | |
| 139 | if (id < 0) { |
| 140 | return -1; |
| 141 | } |
| 142 | |
| 143 | return data[id].end; |
| 144 | } |
| 145 | |
| 146 | void RegExMatch::_bind_methods() { |
| 147 | ClassDB::bind_method(D_METHOD("get_subject" ), &RegExMatch::get_subject); |
| 148 | ClassDB::bind_method(D_METHOD("get_group_count" ), &RegExMatch::get_group_count); |
| 149 | ClassDB::bind_method(D_METHOD("get_names" ), &RegExMatch::get_names); |
| 150 | ClassDB::bind_method(D_METHOD("get_strings" ), &RegExMatch::get_strings); |
| 151 | ClassDB::bind_method(D_METHOD("get_string" , "name" ), &RegExMatch::get_string, DEFVAL(0)); |
| 152 | ClassDB::bind_method(D_METHOD("get_start" , "name" ), &RegExMatch::get_start, DEFVAL(0)); |
| 153 | ClassDB::bind_method(D_METHOD("get_end" , "name" ), &RegExMatch::get_end, DEFVAL(0)); |
| 154 | |
| 155 | ADD_PROPERTY(PropertyInfo(Variant::STRING, "subject" ), "" , "get_subject" ); |
| 156 | ADD_PROPERTY(PropertyInfo(Variant::DICTIONARY, "names" ), "" , "get_names" ); |
| 157 | ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "strings" ), "" , "get_strings" ); |
| 158 | } |
| 159 | |
| 160 | void RegEx::_pattern_info(uint32_t what, void *where) const { |
| 161 | pcre2_pattern_info_32((pcre2_code_32 *)code, what, where); |
| 162 | } |
| 163 | |
| 164 | Ref<RegEx> RegEx::create_from_string(const String &p_pattern) { |
| 165 | Ref<RegEx> ret; |
| 166 | ret.instantiate(); |
| 167 | ret->compile(p_pattern); |
| 168 | return ret; |
| 169 | } |
| 170 | |
| 171 | void RegEx::clear() { |
| 172 | if (code) { |
| 173 | pcre2_code_free_32((pcre2_code_32 *)code); |
| 174 | code = nullptr; |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | Error RegEx::compile(const String &p_pattern) { |
| 179 | pattern = p_pattern; |
| 180 | clear(); |
| 181 | |
| 182 | int err; |
| 183 | PCRE2_SIZE offset; |
| 184 | uint32_t flags = PCRE2_DUPNAMES; |
| 185 | |
| 186 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
| 187 | pcre2_compile_context_32 *cctx = pcre2_compile_context_create_32(gctx); |
| 188 | PCRE2_SPTR32 p = (PCRE2_SPTR32)pattern.get_data(); |
| 189 | |
| 190 | code = pcre2_compile_32(p, pattern.length(), flags, &err, &offset, cctx); |
| 191 | |
| 192 | pcre2_compile_context_free_32(cctx); |
| 193 | |
| 194 | if (!code) { |
| 195 | PCRE2_UCHAR32 buf[256]; |
| 196 | pcre2_get_error_message_32(err, buf, 256); |
| 197 | String message = String::num(offset) + ": " + String((const char32_t *)buf); |
| 198 | ERR_PRINT(message.utf8()); |
| 199 | return FAILED; |
| 200 | } |
| 201 | return OK; |
| 202 | } |
| 203 | |
| 204 | Ref<RegExMatch> RegEx::search(const String &p_subject, int p_offset, int p_end) const { |
| 205 | ERR_FAIL_COND_V(!is_valid(), nullptr); |
| 206 | ERR_FAIL_COND_V_MSG(p_offset < 0, nullptr, "RegEx search offset must be >= 0" ); |
| 207 | |
| 208 | Ref<RegExMatch> result = memnew(RegExMatch); |
| 209 | |
| 210 | int length = p_subject.length(); |
| 211 | if (p_end >= 0 && p_end < length) { |
| 212 | length = p_end; |
| 213 | } |
| 214 | |
| 215 | pcre2_code_32 *c = (pcre2_code_32 *)code; |
| 216 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
| 217 | pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); |
| 218 | PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data(); |
| 219 | |
| 220 | pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); |
| 221 | |
| 222 | int res = pcre2_match_32(c, s, length, p_offset, 0, match, mctx); |
| 223 | |
| 224 | if (res < 0) { |
| 225 | pcre2_match_data_free_32(match); |
| 226 | pcre2_match_context_free_32(mctx); |
| 227 | |
| 228 | return nullptr; |
| 229 | } |
| 230 | |
| 231 | uint32_t size = pcre2_get_ovector_count_32(match); |
| 232 | PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(match); |
| 233 | |
| 234 | result->data.resize(size); |
| 235 | |
| 236 | for (uint32_t i = 0; i < size; i++) { |
| 237 | result->data.write[i].start = ovector[i * 2]; |
| 238 | result->data.write[i].end = ovector[i * 2 + 1]; |
| 239 | } |
| 240 | |
| 241 | pcre2_match_data_free_32(match); |
| 242 | pcre2_match_context_free_32(mctx); |
| 243 | |
| 244 | result->subject = p_subject; |
| 245 | |
| 246 | uint32_t count; |
| 247 | const char32_t *table; |
| 248 | uint32_t entry_size; |
| 249 | |
| 250 | _pattern_info(PCRE2_INFO_NAMECOUNT, &count); |
| 251 | _pattern_info(PCRE2_INFO_NAMETABLE, &table); |
| 252 | _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); |
| 253 | |
| 254 | for (uint32_t i = 0; i < count; i++) { |
| 255 | char32_t id = table[i * entry_size]; |
| 256 | if (result->data[id].start == -1) { |
| 257 | continue; |
| 258 | } |
| 259 | String name = &table[i * entry_size + 1]; |
| 260 | if (result->names.has(name)) { |
| 261 | continue; |
| 262 | } |
| 263 | |
| 264 | result->names.insert(name, id); |
| 265 | } |
| 266 | |
| 267 | return result; |
| 268 | } |
| 269 | |
| 270 | TypedArray<RegExMatch> RegEx::search_all(const String &p_subject, int p_offset, int p_end) const { |
| 271 | ERR_FAIL_COND_V_MSG(p_offset < 0, Array(), "RegEx search offset must be >= 0" ); |
| 272 | |
| 273 | int last_end = -1; |
| 274 | TypedArray<RegExMatch> result; |
| 275 | Ref<RegExMatch> match = search(p_subject, p_offset, p_end); |
| 276 | while (match.is_valid()) { |
| 277 | if (last_end == match->get_end(0)) { |
| 278 | break; |
| 279 | } |
| 280 | result.push_back(match); |
| 281 | last_end = match->get_end(0); |
| 282 | match = search(p_subject, match->get_end(0), p_end); |
| 283 | } |
| 284 | return result; |
| 285 | } |
| 286 | |
| 287 | String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const { |
| 288 | ERR_FAIL_COND_V(!is_valid(), String()); |
| 289 | ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0" ); |
| 290 | |
| 291 | // safety_zone is the number of chars we allocate in addition to the number of chars expected in order to |
| 292 | // guard against the PCRE API writing one additional \0 at the end. PCRE's API docs are unclear on whether |
| 293 | // PCRE understands outlength in pcre2_substitute() as counting an implicit additional terminating char or |
| 294 | // not. always allocating one char more than telling PCRE has us on the safe side. |
| 295 | const int safety_zone = 1; |
| 296 | |
| 297 | PCRE2_SIZE olength = p_subject.length() + 1; // space for output string and one terminating \0 character |
| 298 | Vector<char32_t> output; |
| 299 | output.resize(olength + safety_zone); |
| 300 | |
| 301 | uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH; |
| 302 | if (p_all) { |
| 303 | flags |= PCRE2_SUBSTITUTE_GLOBAL; |
| 304 | } |
| 305 | |
| 306 | PCRE2_SIZE length = p_subject.length(); |
| 307 | if (p_end >= 0 && (uint32_t)p_end < length) { |
| 308 | length = p_end; |
| 309 | } |
| 310 | |
| 311 | pcre2_code_32 *c = (pcre2_code_32 *)code; |
| 312 | pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; |
| 313 | pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); |
| 314 | PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.get_data(); |
| 315 | PCRE2_SPTR32 r = (PCRE2_SPTR32)p_replacement.get_data(); |
| 316 | PCRE2_UCHAR32 *o = (PCRE2_UCHAR32 *)output.ptrw(); |
| 317 | |
| 318 | pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); |
| 319 | |
| 320 | int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); |
| 321 | |
| 322 | if (res == PCRE2_ERROR_NOMEMORY) { |
| 323 | output.resize(olength + safety_zone); |
| 324 | o = (PCRE2_UCHAR32 *)output.ptrw(); |
| 325 | res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); |
| 326 | } |
| 327 | |
| 328 | pcre2_match_data_free_32(match); |
| 329 | pcre2_match_context_free_32(mctx); |
| 330 | |
| 331 | if (res < 0) { |
| 332 | return String(); |
| 333 | } |
| 334 | |
| 335 | return String(output.ptr(), olength); |
| 336 | } |
| 337 | |
| 338 | bool RegEx::is_valid() const { |
| 339 | return (code != nullptr); |
| 340 | } |
| 341 | |
| 342 | String RegEx::get_pattern() const { |
| 343 | return pattern; |
| 344 | } |
| 345 | |
| 346 | int RegEx::get_group_count() const { |
| 347 | ERR_FAIL_COND_V(!is_valid(), 0); |
| 348 | |
| 349 | uint32_t count; |
| 350 | |
| 351 | _pattern_info(PCRE2_INFO_CAPTURECOUNT, &count); |
| 352 | |
| 353 | return count; |
| 354 | } |
| 355 | |
| 356 | PackedStringArray RegEx::get_names() const { |
| 357 | PackedStringArray result; |
| 358 | |
| 359 | ERR_FAIL_COND_V(!is_valid(), result); |
| 360 | |
| 361 | uint32_t count; |
| 362 | const char32_t *table; |
| 363 | uint32_t entry_size; |
| 364 | |
| 365 | _pattern_info(PCRE2_INFO_NAMECOUNT, &count); |
| 366 | _pattern_info(PCRE2_INFO_NAMETABLE, &table); |
| 367 | _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); |
| 368 | |
| 369 | for (uint32_t i = 0; i < count; i++) { |
| 370 | String name = &table[i * entry_size + 1]; |
| 371 | if (result.find(name) < 0) { |
| 372 | result.append(name); |
| 373 | } |
| 374 | } |
| 375 | |
| 376 | return result; |
| 377 | } |
| 378 | |
| 379 | RegEx::RegEx() { |
| 380 | general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr); |
| 381 | } |
| 382 | |
| 383 | RegEx::RegEx(const String &p_pattern) { |
| 384 | general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, nullptr); |
| 385 | compile(p_pattern); |
| 386 | } |
| 387 | |
| 388 | RegEx::~RegEx() { |
| 389 | if (code) { |
| 390 | pcre2_code_free_32((pcre2_code_32 *)code); |
| 391 | } |
| 392 | pcre2_general_context_free_32((pcre2_general_context_32 *)general_ctx); |
| 393 | } |
| 394 | |
| 395 | void RegEx::_bind_methods() { |
| 396 | ClassDB::bind_static_method("RegEx" , D_METHOD("create_from_string" , "pattern" ), &RegEx::create_from_string); |
| 397 | |
| 398 | ClassDB::bind_method(D_METHOD("clear" ), &RegEx::clear); |
| 399 | ClassDB::bind_method(D_METHOD("compile" , "pattern" ), &RegEx::compile); |
| 400 | ClassDB::bind_method(D_METHOD("search" , "subject" , "offset" , "end" ), &RegEx::search, DEFVAL(0), DEFVAL(-1)); |
| 401 | ClassDB::bind_method(D_METHOD("search_all" , "subject" , "offset" , "end" ), &RegEx::search_all, DEFVAL(0), DEFVAL(-1)); |
| 402 | ClassDB::bind_method(D_METHOD("sub" , "subject" , "replacement" , "all" , "offset" , "end" ), &RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1)); |
| 403 | ClassDB::bind_method(D_METHOD("is_valid" ), &RegEx::is_valid); |
| 404 | ClassDB::bind_method(D_METHOD("get_pattern" ), &RegEx::get_pattern); |
| 405 | ClassDB::bind_method(D_METHOD("get_group_count" ), &RegEx::get_group_count); |
| 406 | ClassDB::bind_method(D_METHOD("get_names" ), &RegEx::get_names); |
| 407 | } |
| 408 | |