1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2004-2016, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | * file name: uregex.h |
9 | * encoding: UTF-8 |
10 | * indentation:4 |
11 | * |
12 | * created on: 2004mar09 |
13 | * created by: Andy Heninger |
14 | * |
15 | * ICU Regular Expressions, API for C |
16 | */ |
17 | |
18 | /** |
19 | * \file |
20 | * \brief C API: Regular Expressions |
21 | * |
22 | * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> |
23 | */ |
24 | |
25 | #ifndef UREGEX_H |
26 | #define UREGEX_H |
27 | |
28 | #include "unicode/utext.h" |
29 | #include "unicode/utypes.h" |
30 | |
31 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
32 | |
33 | #include "unicode/localpointer.h" |
34 | #include "unicode/parseerr.h" |
35 | |
36 | struct URegularExpression; |
37 | /** |
38 | * Structure representing a compiled regular expression, plus the results |
39 | * of a match operation. Only forward-declared here, so callers hold it by pointer. |
40 | * @stable ICU 3.0 |
41 | */ |
42 | typedef struct URegularExpression URegularExpression; |
43 | |
44 | |
45 | /** |
46 | * Constants for Regular Expression Match Modes. Each value is a distinct bit; combine with bitwise OR. |
47 | * @stable ICU 2.4 |
48 | */ |
49 | typedef enum URegexpFlag{ |
50 | |
51 | #ifndef U_HIDE_DRAFT_API |
52 | /** Forces normalization of pattern and strings. |
53 | Not implemented yet, just a placeholder, hence draft. |
54 | @draft ICU 2.4 */ |
55 | UREGEX_CANON_EQ = 128, |
56 | #endif /* U_HIDE_DRAFT_API */ |
57 | /** Enable case insensitive matching. @stable ICU 2.4 */ |
58 | UREGEX_CASE_INSENSITIVE = 2, |
59 | |
60 | /** Allow white space and comments within patterns. @stable ICU 2.4 */ |
61 | UREGEX_COMMENTS = 4, |
62 | |
63 | /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. |
64 | * @stable ICU 2.4 */ |
65 | UREGEX_DOTALL = 32, |
66 | |
67 | /** If set, treat the entire pattern as a literal string. |
68 | * Metacharacters or escape sequences in the input sequence will be given |
69 | * no special meaning. |
70 | * |
71 | * The flag UREGEX_CASE_INSENSITIVE retains its impact |
72 | * on matching when used in conjunction with this flag. |
73 | * The other flags become superfluous. |
74 | * |
75 | * @stable ICU 4.0 |
76 | */ |
77 | UREGEX_LITERAL = 16, |
78 | |
79 | /** Control behavior of "$" and "^". |
80 | * If set, recognize line terminators within string, |
81 | * otherwise, match only at start and end of input string. |
82 | * @stable ICU 2.4 */ |
83 | UREGEX_MULTILINE = 8, |
84 | |
85 | /** Unix-only line endings. |
86 | * When this mode is enabled, only \\u000a is recognized as a line ending |
87 | * in the behavior of ., ^, and $. |
88 | * @stable ICU 4.0 |
89 | */ |
90 | UREGEX_UNIX_LINES = 1, |
91 | |
92 | /** Unicode word boundaries. |
93 | * If set, \b uses the Unicode TR 29 definition of word boundaries. |
94 | * Warning: Unicode word boundaries are quite different from |
95 | * traditional regular expression word boundaries. See |
96 | * http://unicode.org/reports/tr29/#Word_Boundaries |
97 | * @stable ICU 2.8 |
98 | */ |
99 | UREGEX_UWORD = 256, |
100 | |
101 | /** Error on unrecognized backslash escapes. |
102 | * If set, fail with an error on patterns that contain |
103 | * backslash-escaped ASCII letters without a known special |
104 | * meaning. If this flag is not set, these |
105 | * escaped letters represent themselves. |
106 | * @stable ICU 4.0 |
107 | */ |
108 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 |
109 | |
110 | } URegexpFlag; |
111 | |
112 | /** |
113 | * Open (compile) an ICU regular expression. Compiles the regular expression in |
114 | * string form into an internal representation using the specified match mode flags. |
115 | * The resulting regular expression handle can then be used to perform various |
116 | * matching operations. |
117 | * |
118 | * |
119 | * @param pattern The Regular Expression pattern to be compiled. |
120 | * @param patternLength The length of the pattern, or -1 if the pattern is |
121 | * NUL-terminated. |
122 | * @param flags Flags that alter the default matching behavior for |
123 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
124 | * example. For default behavior, set this parameter to zero. |
125 | * See <code>enum URegexpFlag</code>. All desired flags |
126 | * are bitwise-ORed together. |
127 | * @param pe Receives the position (line and column numbers) of any syntax |
128 | * error within the source regular expression string. If this |
129 | * information is not wanted, pass NULL for this parameter. |
130 | * @param status Receives errors detected by this function. |
131 | * @return The URegularExpression handle for the compiled pattern. |
132 | * @stable ICU 3.0 |
133 | */ |
134 | U_STABLE URegularExpression * U_EXPORT2 |
135 | uregex_open( const UChar *pattern, |
136 | int32_t patternLength, |
137 | uint32_t flags, |
138 | UParseError *pe, |
139 | UErrorCode *status); |
140 | |
141 | /** |
142 | * Open (compile) an ICU regular expression. Compiles the regular expression in |
143 | * string form into an internal representation using the specified match mode flags. |
144 | * The resulting regular expression handle can then be used to perform various |
145 | * matching operations. |
146 | * <p> |
147 | * The contents of the pattern UText will be extracted and saved. Ownership of the |
148 | * UText struct itself remains with the caller. This is to match the behavior of |
149 | * uregex_open(). |
150 | * |
151 | * @param pattern The Regular Expression pattern to be compiled. |
152 | * @param flags Flags that alter the default matching behavior for |
153 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
154 | * example. For default behavior, set this parameter to zero. |
155 | * See <code>enum URegexpFlag</code>. All desired flags |
156 | * are bitwise-ORed together. |
157 | * @param pe Receives the position (line and column numbers) of any syntax |
158 | * error within the source regular expression string. If this |
159 | * information is not wanted, pass NULL for this parameter. |
160 | * @param status Receives errors detected by this function. |
161 | * @return The URegularExpression handle for the compiled pattern. |
162 | * @stable ICU 4.6 |
163 | */ |
164 | U_STABLE URegularExpression * U_EXPORT2 |
165 | uregex_openUText(UText *pattern, |
166 | uint32_t flags, |
167 | UParseError *pe, |
168 | UErrorCode *status); |
169 | |
170 | #if !UCONFIG_NO_CONVERSION |
171 | /** |
172 | * Open (compile) an ICU regular expression. The resulting regular expression |
173 | * handle can then be used to perform various matching operations. |
174 | * <p> |
175 | * This function is the same as uregex_open(), except that the pattern |
176 | * is supplied as an 8 bit char * string in the default code page. |
177 | * |
178 | * @param pattern The Regular Expression pattern to be compiled, |
179 | * NUL-terminated. |
180 | * @param flags Flags that alter the default matching behavior for |
181 | * the regular expression, UREGEX_CASE_INSENSITIVE, for |
182 | * example. For default behavior, set this parameter to zero. |
183 | * See <code>enum URegexpFlag</code>. All desired flags |
184 | * are bitwise-ORed together. |
185 | * @param pe Receives the position (line and column numbers) of any syntax |
186 | * error within the source regular expression string. If this |
187 | * information is not wanted, pass NULL for this parameter. |
188 | * @param status Receives errors detected by this function. |
189 | * @return The URegularExpression object representing the compiled |
190 | * pattern. |
191 | * |
192 | * @stable ICU 3.0 |
193 | */ |
194 | U_STABLE URegularExpression * U_EXPORT2 |
195 | uregex_openC( const char *pattern, |
196 | uint32_t flags, |
197 | UParseError *pe, |
198 | UErrorCode *status); |
199 | #endif /* !UCONFIG_NO_CONVERSION */ |
200 | |
201 | |
202 | |
203 | /** |
204 | * Close the regular expression, releasing all resources (memory) it |
205 | * was holding. |
206 | * |
207 | * @param regexp The regular expression to be closed. |
208 | * @stable ICU 3.0 |
209 | */ |
210 | U_STABLE void U_EXPORT2 |
211 | uregex_close(URegularExpression *regexp); |
212 | |
213 | #if U_SHOW_CPLUSPLUS_API |
214 | |
215 | U_NAMESPACE_BEGIN |
216 | |
217 | /** |
218 | * \class LocalURegularExpressionPointer |
219 | * "Smart pointer" class, closes a URegularExpression via uregex_close(). |
220 | * For most methods see the LocalPointerBase base class. |
221 | * |
222 | * @see LocalPointerBase |
223 | * @see LocalPointer |
224 | * @stable ICU 4.4 |
225 | */ |
226 | U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); |
227 | |
228 | U_NAMESPACE_END |
229 | |
230 | #endif /* U_SHOW_CPLUSPLUS_API */ |
231 | |
232 | /** |
233 | * Make a copy of a compiled regular expression. Cloning a regular |
234 | * expression is faster than opening a second instance from the source |
235 | * form of the expression, and requires less memory. |
236 | * <p> |
237 | * Note that the current input string and the position of any matched text |
238 | * within it are not cloned; only the pattern itself and the |
239 | * match mode flags are copied. |
240 | * <p> |
241 | * Cloning can be particularly useful to threaded applications that perform |
242 | * multiple match operations in parallel. Each concurrent regular expression |
243 | * operation requires its own instance of a URegularExpression. |
244 | * |
245 | * @param regexp The compiled regular expression to be cloned. |
246 | * @param status Receives indication of any errors encountered. |
247 | * @return The cloned copy of the compiled regular expression. |
248 | * @stable ICU 3.0 |
249 | */ |
250 | U_STABLE URegularExpression * U_EXPORT2 |
251 | uregex_clone(const URegularExpression *regexp, UErrorCode *status); |
252 | |
253 | /** |
254 | * Returns a pointer to the source form of the pattern for this regular expression. |
255 | * This function will work even if the pattern was originally specified as a UText. |
256 | * |
257 | * @param regexp The compiled regular expression. |
258 | * @param patLength This output parameter will be set to the length of the |
259 | * pattern string. A NULL pointer may be used here if the |
260 | * pattern length is not needed, as would be the case if |
261 | * the pattern is known in advance to be a NUL-terminated |
262 | * string. |
263 | * @param status Receives errors detected by this function. |
264 | * @return A pointer to the pattern string. The storage for the string is |
265 | * owned by the regular expression object, and must not be |
266 | * altered or deleted by the application. The returned string |
267 | * will remain valid until the regular expression is closed. |
268 | * @stable ICU 3.0 |
269 | */ |
270 | U_STABLE const UChar * U_EXPORT2 |
271 | uregex_pattern(const URegularExpression *regexp, |
272 | int32_t *patLength, |
273 | UErrorCode *status); |
274 | |
275 | /** |
276 | * Returns the source text of the pattern for this regular expression. |
277 | * This function will work even if the pattern was originally specified as a UChar string. |
278 | * |
279 | * @param regexp The compiled regular expression. |
280 | * @param status Receives errors detected by this function. |
281 | * @return The pattern text. The storage for the text is owned by the regular expression |
282 | * object, and must not be altered or deleted by the application. |
283 | * |
284 | * @stable ICU 4.6 |
285 | */ |
286 | U_STABLE UText * U_EXPORT2 |
287 | uregex_patternUText(const URegularExpression *regexp, |
288 | UErrorCode *status); |
289 | |
290 | /** |
291 | * Get the match mode flags that were specified when compiling this regular expression. |
292 | * @param regexp The compiled regular expression. |
293 | * @param status Receives errors detected by this function. |
294 | * @return The match mode flags. |
295 | * @see URegexpFlag |
296 | * @stable ICU 3.0 |
297 | */ |
298 | U_STABLE int32_t U_EXPORT2 |
299 | uregex_flags(const URegularExpression *regexp, |
300 | UErrorCode *status); |
301 | |
302 | |
303 | /** |
304 | * Set the subject text string upon which the regular expression will look for matches. |
305 | * This function may be called any number of times, allowing the regular |
306 | * expression pattern to be applied to different strings. |
307 | * <p> |
308 | * Regular expression matching operations work directly on the application's |
309 | * string data. No copy is made. The subject string data must not be |
310 | * altered after calling this function until after all regular expression |
311 | * operations involving this string data are completed. |
312 | * <p> |
313 | * Zero-length strings are permitted. In this case, no subsequent match |
314 | * operation will dereference the text string pointer. |
315 | * |
316 | * @param regexp The compiled regular expression. |
317 | * @param text The subject text string. |
318 | * @param textLength The length of the subject text, or -1 if the string |
319 | * is NUL-terminated. |
320 | * @param status Receives errors detected by this function. |
321 | * @stable ICU 3.0 |
322 | */ |
323 | U_STABLE void U_EXPORT2 |
324 | uregex_setText(URegularExpression *regexp, |
325 | const UChar *text, |
326 | int32_t textLength, |
327 | UErrorCode *status); |
328 | |
329 | |
330 | /** |
331 | * Set the subject text string upon which the regular expression will look for matches. |
332 | * This function may be called any number of times, allowing the regular |
333 | * expression pattern to be applied to different strings. |
334 | * <p> |
335 | * Regular expression matching operations work directly on the application's |
336 | * string data; only a shallow clone is made. The subject string data must not be |
337 | * altered after calling this function until after all regular expression |
338 | * operations involving this string data are completed. |
339 | * |
340 | * @param regexp The compiled regular expression. |
341 | * @param text The subject text, supplied as a UText. |
342 | * @param status Receives errors detected by this function. |
343 | * |
344 | * @stable ICU 4.6 |
345 | */ |
346 | U_STABLE void U_EXPORT2 |
347 | uregex_setUText(URegularExpression *regexp, |
348 | UText *text, |
349 | UErrorCode *status); |
350 | |
351 | /** |
352 | * Get the subject text that is currently associated with this |
353 | * regular expression object. If the input was supplied using uregex_setText(), |
354 | * that pointer will be returned. Otherwise, the characters in the input will |
355 | * be extracted to a buffer and returned. In either case, ownership remains |
356 | * with the regular expression object. |
357 | * |
358 | * This function will work even if the input was originally specified as a UText. |
359 | * |
360 | * @param regexp The compiled regular expression. |
361 | * @param textLength The length of the string is returned in this output parameter. |
362 | * A NULL pointer may be used here if the |
363 | * text length is not needed, as would be the case if |
364 | * the text is known in advance to be a NUL-terminated |
365 | * string. |
366 | * @param status Receives errors detected by this function. |
367 | * @return Pointer to the subject text string currently associated with |
368 | * this regular expression. |
369 | * @stable ICU 3.0 |
370 | */ |
371 | U_STABLE const UChar * U_EXPORT2 |
372 | uregex_getText(URegularExpression *regexp, |
373 | int32_t *textLength, |
374 | UErrorCode *status); |
375 | |
376 | /** |
377 | * Get the subject text that is currently associated with this |
378 | * regular expression object, as a UText. |
379 | * |
380 | * This function will work even if the input was originally specified as a UChar string. |
381 | * |
382 | * @param regexp The compiled regular expression. |
383 | * @param dest A mutable UText in which to store the current input. |
384 | * If NULL, a new UText will be created as an immutable shallow clone |
385 | * of the actual input string. |
386 | * @param status Receives errors detected by this function. |
387 | * @return The subject text currently associated with this regular expression. |
388 | * If a pre-allocated UText was provided, it will always be used and returned. |
389 | * |
390 | * @stable ICU 4.6 |
391 | */ |
392 | U_STABLE UText * U_EXPORT2 |
393 | uregex_getUText(URegularExpression *regexp, |
394 | UText *dest, |
395 | UErrorCode *status); |
396 | |
397 | /** |
398 | * Set the subject text string upon which the regular expression is looking for matches |
399 | * without changing any other aspect of the matching state. |
400 | * The new and previous text strings must have the same content. |
401 | * |
402 | * This function is intended for use in environments where ICU is operating on |
403 | * strings that may move around in memory. It provides a mechanism for notifying |
404 | * ICU that the string has been relocated, and providing a new UText to access the |
405 | * string in its new position. |
406 | * |
407 | * Note that the regular expression implementation never copies the underlying text |
408 | * of a string being matched, but always operates directly on the original text |
409 | * provided by the user. Refreshing simply drops the references to the old text |
410 | * and replaces them with references to the new text. |
411 | * |
412 | * Caution: this function is normally used only by very specialized |
413 | * system-level code. One example use case is with garbage collection |
414 | * that moves the text in memory. |
415 | * |
416 | * @param regexp The compiled regular expression. |
417 | * @param text The new (moved) text string, supplied as a UText. |
418 | * @param status Receives errors detected by this function. |
419 | * |
420 | * @stable ICU 4.8 |
421 | */ |
422 | U_STABLE void U_EXPORT2 |
423 | uregex_refreshUText(URegularExpression *regexp, |
424 | UText *text, |
425 | UErrorCode *status); |
426 | |
427 | /** |
428 | * Attempts to match the input string against the pattern. |
429 | * To succeed, the match must extend to the end of the string, |
430 | * or cover the complete match region. |
431 | * |
432 | * If startIndex >= zero the match operation starts at the specified |
433 | * index and must extend to the end of the input string. Any region |
434 | * that has been specified is reset. |
435 | * |
436 | * If startIndex == -1 the match must cover the input region, or the entire |
437 | * input string if no region has been set. This directly corresponds to |
438 | * Matcher.matches() in Java. |
439 | * |
440 | * @param regexp The compiled regular expression. |
441 | * @param startIndex The input string (native) index at which to begin matching, or -1 |
442 | * to match the input Region. |
443 | * @param status Receives errors detected by this function. |
444 | * @return TRUE if there is a match. |
445 | * @stable ICU 3.0 |
446 | */ |
447 | U_STABLE UBool U_EXPORT2 |
448 | uregex_matches(URegularExpression *regexp, |
449 | int32_t startIndex, |
450 | UErrorCode *status); |
451 | |
452 | /** |
453 | * 64bit version of uregex_matches(). |
454 | * Attempts to match the input string against the pattern. |
455 | * To succeed, the match must extend to the end of the string, |
456 | * or cover the complete match region. |
457 | * |
458 | * If startIndex >= zero the match operation starts at the specified |
459 | * index and must extend to the end of the input string. Any region |
460 | * that has been specified is reset. |
461 | * |
462 | * If startIndex == -1 the match must cover the input region, or the entire |
463 | * input string if no region has been set. This directly corresponds to |
464 | * Matcher.matches() in Java. |
465 | * |
466 | * @param regexp The compiled regular expression. |
467 | * @param startIndex The input string (native) index at which to begin matching, or -1 |
468 | * to match the input Region. |
469 | * @param status Receives errors detected by this function. |
470 | * @return TRUE if there is a match. |
471 | * @stable ICU 4.6 |
472 | */ |
473 | U_STABLE UBool U_EXPORT2 |
474 | uregex_matches64(URegularExpression *regexp, |
475 | int64_t startIndex, |
476 | UErrorCode *status); |
477 | |
478 | /** |
479 | * Attempts to match the input string, starting from the specified index, against the pattern. |
480 | * The match may be of any length, and is not required to extend to the end |
481 | * of the input string. Contrast with uregex_matches(). |
482 | * |
483 | * <p>If startIndex is >= 0 any input region that was set for this |
484 | * URegularExpression is reset before the operation begins. |
485 | * |
486 | * <p>If the specified starting index == -1 the match begins at the start of the input |
487 | * region, or at the start of the full string if no region has been specified. |
488 | * This corresponds directly with Matcher.lookingAt() in Java. |
489 | * |
490 | * <p>If the match succeeds then more information can be obtained via the |
491 | * <code>uregex_start()</code>, <code>uregex_end()</code>, |
492 | * and <code>uregex_group()</code> functions.</p> |
493 | * |
494 | * @param regexp The compiled regular expression. |
495 | * @param startIndex The input string (native) index at which to begin matching, or |
496 | * -1 to match the Input Region. |
497 | * @param status A reference to a UErrorCode to receive any errors. |
498 | * @return TRUE if there is a match. |
499 | * @stable ICU 3.0 |
500 | */ |
501 | U_STABLE UBool U_EXPORT2 |
502 | uregex_lookingAt(URegularExpression *regexp, |
503 | int32_t startIndex, |
504 | UErrorCode *status); |
505 | |
506 | /** |
507 | * 64bit version of uregex_lookingAt(). |
508 | * Attempts to match the input string, starting from the specified index, against the pattern. |
509 | * The match may be of any length, and is not required to extend to the end |
510 | * of the input string. Contrast with uregex_matches(). |
511 | * |
512 | * <p>If startIndex is >= 0 any input region that was set for this |
513 | * URegularExpression is reset before the operation begins. |
514 | * |
515 | * <p>If the specified starting index == -1 the match begins at the start of the input |
516 | * region, or at the start of the full string if no region has been specified. |
517 | * This corresponds directly with Matcher.lookingAt() in Java. |
518 | * |
519 | * <p>If the match succeeds then more information can be obtained via the |
520 | * <code>uregex_start()</code>, <code>uregex_end()</code>, |
521 | * and <code>uregex_group()</code> functions.</p> |
522 | * |
523 | * @param regexp The compiled regular expression. |
524 | * @param startIndex The input string (native) index at which to begin matching, or |
525 | * -1 to match the Input Region. |
526 | * @param status A reference to a UErrorCode to receive any errors. |
527 | * @return TRUE if there is a match. |
528 | * @stable ICU 4.6 |
529 | */ |
530 | U_STABLE UBool U_EXPORT2 |
531 | uregex_lookingAt64(URegularExpression *regexp, |
532 | int64_t startIndex, |
533 | UErrorCode *status); |
534 | |
535 | /** |
536 | * Find the first matching substring of the input string that matches the pattern. |
537 | * If startIndex is >= zero the search for a match begins at the specified index, |
538 | * and any match region is reset. This corresponds directly with |
539 | * Matcher.find(startIndex) in Java. |
540 | * |
541 | * If startIndex == -1 the search begins at the start of the input region, |
542 | * or at the start of the full string if no region has been specified. |
543 | * |
544 | * If a match is found, <code>uregex_start()</code>, <code>uregex_end()</code>, and |
545 | * <code>uregex_group()</code> will provide more information regarding the match. |
546 | * |
547 | * @param regexp The compiled regular expression. |
548 | * @param startIndex The position (native) in the input string to begin the search, or |
549 | * -1 to search within the Input Region. |
550 | * @param status A reference to a UErrorCode to receive any errors. |
551 | * @return TRUE if a match is found. |
552 | * @stable ICU 3.0 |
553 | */ |
554 | U_STABLE UBool U_EXPORT2 |
555 | uregex_find(URegularExpression *regexp, |
556 | int32_t startIndex, |
557 | UErrorCode *status); |
558 | |
559 | /** |
560 | * 64bit version of uregex_find(). |
561 | * Find the first matching substring of the input string that matches the pattern. |
562 | * If startIndex is >= zero the search for a match begins at the specified index, |
563 | * and any match region is reset. This corresponds directly with |
564 | * Matcher.find(startIndex) in Java. |
565 | * |
566 | * If startIndex == -1 the search begins at the start of the input region, |
567 | * or at the start of the full string if no region has been specified. |
568 | * |
569 | * If a match is found, <code>uregex_start()</code>, <code>uregex_end()</code>, and |
570 | * <code>uregex_group()</code> will provide more information regarding the match. |
571 | * |
572 | * @param regexp The compiled regular expression. |
573 | * @param startIndex The position (native) in the input string to begin the search, or |
574 | * -1 to search within the Input Region. |
575 | * @param status A reference to a UErrorCode to receive any errors. |
576 | * @return TRUE if a match is found. |
577 | * @stable ICU 4.6 |
578 | */ |
579 | U_STABLE UBool U_EXPORT2 |
580 | uregex_find64(URegularExpression *regexp, |
581 | int64_t startIndex, |
582 | UErrorCode *status); |
583 | |
584 | /** |
585 | * Find the next pattern match in the input string. Begin searching |
586 | * the input at the location following the end of the previous match, |
587 | * or at the start of the string (or region) if there is no |
588 | * previous match. If a match is found, <code>uregex_start()</code>, <code>uregex_end()</code>, and |
589 | * <code>uregex_group()</code> will provide more information regarding the match. |
590 | * |
591 | * @param regexp The compiled regular expression. |
592 | * @param status A reference to a UErrorCode to receive any errors. |
593 | * @return TRUE if a match is found. |
594 | * @see uregex_reset |
595 | * @stable ICU 3.0 |
596 | */ |
597 | U_STABLE UBool U_EXPORT2 |
598 | uregex_findNext(URegularExpression *regexp, |
599 | UErrorCode *status); |
600 | |
601 | /** |
602 | * Get the number of capturing groups in this regular expression's pattern. |
603 | * @param regexp The compiled regular expression. |
604 | * @param status A reference to a UErrorCode to receive any errors. |
605 | * @return The number of capturing groups in the pattern. |
606 | * @stable ICU 3.0 |
607 | */ |
608 | U_STABLE int32_t U_EXPORT2 |
609 | uregex_groupCount(URegularExpression *regexp, |
610 | UErrorCode *status); |
611 | |
612 | /** |
613 | * Get the group number corresponding to a named capture group. |
614 | * The returned number can be used with any function that accesses |
615 | * capture groups by number. |
616 | * |
617 | * The function returns an error status if the specified name does not |
618 | * appear in the pattern. |
619 | * |
620 | * @param regexp The compiled regular expression. |
621 | * @param groupName The capture group name. |
622 | * @param nameLength The length of the name, or -1 if the name is a |
623 | * NUL-terminated string. |
624 | * @param status A pointer to a UErrorCode to receive any errors. |
625 | * @return The group number corresponding to the named capture group. |
626 | * @stable ICU 55 |
627 | */ |
628 | U_STABLE int32_t U_EXPORT2 |
629 | uregex_groupNumberFromName(URegularExpression *regexp, |
630 | const UChar *groupName, |
631 | int32_t nameLength, |
632 | UErrorCode *status); |
633 | |
634 | |
635 | /** |
636 | * Get the group number corresponding to a named capture group. |
637 | * The returned number can be used with any function that accesses |
638 | * capture groups by number. |
639 | * |
640 | * The function returns an error status if the specified name does not |
641 | * appear in the pattern. |
642 | * |
643 | * @param regexp The compiled regular expression. |
644 | * @param groupName The capture group name, |
645 | * platform invariant characters only. |
646 | * @param nameLength The length of the name, or -1 if the name is |
647 | * NUL-terminated. |
648 | * @param status A pointer to a UErrorCode to receive any errors. |
649 | * @return The group number corresponding to the named capture group. |
650 | * @stable ICU 55 |
651 | */ |
652 | U_STABLE int32_t U_EXPORT2 |
653 | uregex_groupNumberFromCName(URegularExpression *regexp, |
654 | const char *groupName, |
655 | int32_t nameLength, |
656 | UErrorCode *status); |
657 | |
658 | /** Extract the string for the specified matching expression or subexpression. |
659 | * Group #0 is the complete string of matched text. |
660 | * Group #1 is the text matched by the first set of capturing parentheses. |
661 | * |
662 | * @param regexp The compiled regular expression. |
663 | * @param groupNum The capture group to extract. Group 0 is the complete |
664 | * match. The value of this parameter must be |
665 | * less than or equal to the number of capture groups in |
666 | * the pattern. |
667 | * @param dest Buffer to receive the matching string data. |
668 | * @param destCapacity Capacity of the dest buffer. |
669 | * @param status A reference to a UErrorCode to receive any errors. |
670 | * @return Length of matching data, |
671 | * or -1 if no applicable match. |
672 | * @stable ICU 3.0 |
673 | */ |
674 | U_STABLE int32_t U_EXPORT2 |
675 | uregex_group(URegularExpression *regexp, |
676 | int32_t groupNum, |
677 | UChar *dest, |
678 | int32_t destCapacity, |
679 | UErrorCode *status); |
680 | |
681 | /** Returns a shallow immutable clone of the entire input string with the current index set |
682 | * to the beginning of the requested capture group. The capture group length is also |
683 | * returned via groupLength. |
684 | * Group #0 is the complete string of matched text. |
685 | * Group #1 is the text matched by the first set of capturing parentheses. |
686 | * |
687 | * @param regexp The compiled regular expression. |
688 | * @param groupNum The capture group to extract. Group 0 is the complete |
689 | * match. The value of this parameter must be |
690 | * less than or equal to the number of capture groups in |
691 | * the pattern. |
692 | * @param dest A mutable UText in which to store the current input. |
693 | * If NULL, a new UText will be created as an immutable shallow clone |
694 | * of the entire input string. |
695 | * @param groupLength The group length of the desired capture group. Output parameter. |
696 | * @param status A reference to a UErrorCode to receive any errors. |
697 | * @return The subject text currently associated with this regular expression. |
698 | * If a pre-allocated UText was provided, it will always be used and returned. |
699 | * |
700 | * |
701 | * @stable ICU 4.6 |
702 | */ |
703 | U_STABLE UText * U_EXPORT2 |
704 | uregex_groupUText(URegularExpression *regexp, |
705 | int32_t groupNum, |
706 | UText *dest, |
707 | int64_t *groupLength, |
708 | UErrorCode *status); |
709 | |
710 | /** |
711 | * Returns the index in the input string of the start of the text matched by the |
712 | * specified capture group during the previous match operation. Returns -1 if |
713 | * the capture group was not part of the last match. |
714 | * Group #0 refers to the complete range of matched text. |
715 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
716 | * |
717 | * @param regexp The compiled regular expression. |
718 | * @param groupNum The capture group number. |
719 | * @param status A reference to a UErrorCode to receive any errors. |
720 | * @return The starting (native) position in the input of the text matched |
721 | * by the specified group. |
722 | * @stable ICU 3.0 |
723 | */ |
724 | U_STABLE int32_t U_EXPORT2 |
725 | uregex_start(URegularExpression *regexp, |
726 | int32_t groupNum, |
727 | UErrorCode *status); |
728 | |
729 | /** |
730 | * 64bit version of uregex_start(). |
731 | * Returns the index in the input string of the start of the text matched by the |
732 | * specified capture group during the previous match operation. Returns -1 if |
733 | * the capture group was not part of the last match. |
734 | * Group #0 refers to the complete range of matched text. |
735 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
736 | * |
737 | * @param regexp The compiled regular expression. |
738 | * @param groupNum The capture group number. |
739 | * @param status A reference to a UErrorCode to receive any errors. |
740 | * @return The starting (native) position in the input of the text matched |
741 | * by the specified group. |
742 | * @stable ICU 4.6 |
743 | */ |
744 | U_STABLE int64_t U_EXPORT2 |
745 | uregex_start64(URegularExpression *regexp, |
746 | int32_t groupNum, |
747 | UErrorCode *status); |
748 | |
749 | /** |
750 | * Returns the index in the input string of the position following the end |
751 | * of the text matched by the specified capture group. |
752 | * Returns -1 if the capture group was not part of the last match. |
753 | * Group #0 refers to the complete range of matched text. |
754 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
755 | * |
756 | * @param regexp The compiled regular expression. |
757 | * @param groupNum The capture group number. |
758 | * @param status A reference to a UErrorCode to receive any errors. |
759 | * @return the (native) index of the position following the last character matched by the group, or -1 if the group was not part of the last match. |
760 | * @stable ICU 3.0 |
761 | */ |
762 | U_STABLE int32_t U_EXPORT2 |
763 | uregex_end(URegularExpression *regexp, |
764 | int32_t groupNum, |
765 | UErrorCode *status); |
766 | |
767 | /** |
768 | * 64bit version of uregex_end(). |
769 | * Returns the index in the input string of the position following the end |
770 | * of the text matched by the specified capture group. |
771 | * Returns -1 if the capture group was not part of the last match. |
772 | * Group #0 refers to the complete range of matched text. |
773 | * Group #1 refers to the text matched by the first set of capturing parentheses. |
774 | * |
775 | * @param regexp The compiled regular expression. |
776 | * @param groupNum The capture group number. |
777 | * @param status A reference to a UErrorCode to receive any errors. |
778 | * @return the (native) index of the position following the last character matched by the group, or -1 if the group was not part of the last match. |
779 | * @stable ICU 4.6 |
780 | */ |
781 | U_STABLE int64_t U_EXPORT2 |
782 | uregex_end64(URegularExpression *regexp, |
783 | int32_t groupNum, |
784 | UErrorCode *status); |
785 | |
786 | /** |
787 | * Reset any saved state from the previous match. Has the effect of |
788 | * causing uregex_findNext() to begin at the specified index, and causing |
789 | * uregex_start(), uregex_end() and uregex_group() to return an error |
790 | * indicating that there is no match information available. Clears any |
791 | * match region that may have been set. |
792 | * |
793 | * @param regexp The compiled regular expression. |
794 | * @param index The position (native) in the text at which a |
795 | * uregex_findNext() should begin searching. |
796 | * @param status A reference to a UErrorCode to receive any errors. |
797 | * @stable ICU 3.0 |
798 | */ |
799 | U_STABLE void U_EXPORT2 |
800 | uregex_reset(URegularExpression *regexp, |
801 | int32_t index, |
802 | UErrorCode *status); |
803 | |
804 | /** |
805 | * 64bit version of uregex_reset(). |
806 | * Reset any saved state from the previous match. Has the effect of |
807 | * causing uregex_findNext() to begin at the specified index, and causing |
808 | * uregex_start(), uregex_end() and uregex_group() to return an error |
809 | * indicating that there is no match information available. Clears any |
810 | * match region that may have been set. |
811 | * |
812 | * @param regexp The compiled regular expression. |
813 | * @param index The position (native) in the text at which a |
814 | * uregex_findNext() should begin searching. |
815 | * @param status A reference to a UErrorCode to receive any errors. |
816 | * @stable ICU 4.6 |
817 | */ |
818 | U_STABLE void U_EXPORT2 |
819 | uregex_reset64(URegularExpression *regexp, |
820 | int64_t index, |
821 | UErrorCode *status); |
822 | |
823 | /** |
824 | * Sets the limits of the matching region for this URegularExpression. |
825 | * The region is the part of the input string that will be considered when matching. |
826 | * Invoking this method resets any saved state from the previous match, |
827 | * then sets the region to start at the index specified by the regionStart parameter |
828 | * and end at the index specified by the regionLimit parameter. |
829 | * |
830 | * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds() |
831 | * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently |
832 | * at or around the boundaries of the region. |
833 | * |
834 | * The function will fail if regionStart is greater than regionLimit, or if either index |
835 | * is less than zero or greater than the length of the string being matched. |
836 | * |
837 | * @param regexp The compiled regular expression. |
838 | * @param regionStart The (native) index to begin searches at. |
839 | * @param regionLimit The (native) index to end searches at (exclusive). |
840 | * @param status A pointer to a UErrorCode to receive any errors. |
841 | * @stable ICU 4.0 |
842 | */ |
843 | U_STABLE void U_EXPORT2 |
844 | uregex_setRegion(URegularExpression *regexp, |
845 | int32_t regionStart, |
846 | int32_t regionLimit, |
847 | UErrorCode *status); |
848 | |
849 | /** |
850 | * 64bit version of uregex_setRegion(). |
851 | * Sets the limits of the matching region for this URegularExpression. |
852 | * The region is the part of the input string that will be considered when matching. |
853 | * Invoking this method resets any saved state from the previous match, |
854 | * then sets the region to start at the index specified by the regionStart parameter |
855 | * and end at the index specified by the regionLimit parameter. |
856 | * |
857 | * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds() |
858 | * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently |
859 | * at or around the boundaries of the region. |
860 | * |
861 | * The function will fail if regionStart is greater than regionLimit, or if either index |
862 | * is less than zero or greater than the length of the string being matched. |
863 | * |
864 | * @param regexp The compiled regular expression. |
865 | * @param regionStart The (native) index to begin searches at. |
866 | * @param regionLimit The (native) index to end searches at (exclusive). |
867 | * @param status A pointer to a UErrorCode to receive any errors. |
868 | * @stable ICU 4.6 |
869 | */ |
870 | U_STABLE void U_EXPORT2 |
871 | uregex_setRegion64(URegularExpression *regexp, |
872 | int64_t regionStart, |
873 | int64_t regionLimit, |
874 | UErrorCode *status); |
875 | |
876 | /** |
877 | * Set the matching region and the starting index for subsequent matches |
878 | * in a single operation. |
879 | * This is useful because the usual function for setting the starting |
880 | * index, uregex_reset(), also resets any region limits. |
881 | * |
882 | * @param regexp The compiled regular expression. |
883 | * @param regionStart The (native) index to begin searches at. |
884 | * @param regionLimit The (native) index to end searches at (exclusive). |
885 | * @param startIndex The index in the input text at which the next |
886 | * match operation should begin. |
887 | * @param status A pointer to a UErrorCode to receive any errors. |
888 | * @stable ICU 4.6 |
889 | */ |
890 | U_STABLE void U_EXPORT2 |
891 | uregex_setRegionAndStart(URegularExpression *regexp, |
892 | int64_t regionStart, |
893 | int64_t regionLimit, |
894 | int64_t startIndex, |
895 | UErrorCode *status); |
896 | |
897 | /** |
898 | * Reports the start index of the matching region. Any matches found are limited to |
899 | * the region bounded by regionStart (inclusive) and regionEnd (exclusive). |
900 | * |
901 | * @param regexp The compiled regular expression. |
902 | * @param status A pointer to a UErrorCode to receive any errors. |
903 | * @return The starting (native) index of this matcher's region. |
904 | * @stable ICU 4.0 |
905 | */ |
906 | U_STABLE int32_t U_EXPORT2 |
907 | uregex_regionStart(const URegularExpression *regexp, |
908 | UErrorCode *status); |
909 | |
910 | /** |
911 | * 64bit version of uregex_regionStart(). |
912 | * Reports the start index of the matching region. Any matches found are limited to |
913 | * the region bounded by regionStart (inclusive) and regionEnd (exclusive). |
914 | * |
915 | * @param regexp The compiled regular expression. |
916 | * @param status A pointer to a UErrorCode to receive any errors. |
917 | * @return The starting (native) index of this matcher's region. |
918 | * @stable ICU 4.6 |
919 | */ |
920 | U_STABLE int64_t U_EXPORT2 |
921 | uregex_regionStart64(const URegularExpression *regexp, |
922 | UErrorCode *status); |
923 | |
924 | /** |
925 | * Reports the end index (exclusive) of the matching region for this URegularExpression. |
926 | * Any matches found are limited to the region bounded by regionStart (inclusive) |
927 | * and regionEnd (exclusive). |
928 | * |
929 | * @param regexp The compiled regular expression. |
930 | * @param status A pointer to a UErrorCode to receive any errors. |
931 | * @return The ending point (native) of this matcher's region. |
932 | * @stable ICU 4.0 |
933 | */ |
934 | U_STABLE int32_t U_EXPORT2 |
935 | uregex_regionEnd(const URegularExpression *regexp, |
936 | UErrorCode *status); |
937 | |
938 | /** |
939 | * 64bit version of uregex_regionEnd(). |
940 | * Reports the end index (exclusive) of the matching region for this URegularExpression. |
941 | * Any matches found are limited to the region bounded by regionStart (inclusive) |
942 | * and regionEnd (exclusive). |
943 | * |
944 | * @param regexp The compiled regular expression. |
945 | * @param status A pointer to a UErrorCode to receive any errors. |
946 | * @return The ending point (native) of this matcher's region. |
947 | * @stable ICU 4.6 |
948 | */ |
949 | U_STABLE int64_t U_EXPORT2 |
950 | uregex_regionEnd64(const URegularExpression *regexp, |
951 | UErrorCode *status); |
952 | |
953 | /** |
954 | * Queries the transparency of region bounds for this URegularExpression. |
955 | * See uregex_useTransparentBounds() for a description of transparent and opaque bounds. |
956 | * By default, matching boundaries are opaque. |
957 | * |
958 | * @param regexp The compiled regular expression. |
959 | * @param status A pointer to a UErrorCode to receive any errors. |
960 | * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. |
961 | * @stable ICU 4.0 |
962 | */ |
963 | U_STABLE UBool U_EXPORT2 |
964 | uregex_hasTransparentBounds(const URegularExpression *regexp, |
965 | UErrorCode *status); |
966 | |
967 | |
968 | /** |
969 | * Sets the transparency of region bounds for this URegularExpression. |
970 | * Invoking this function with an argument of TRUE will set matches to use transparent bounds. |
971 | * If the boolean argument is FALSE, then opaque bounds will be used. |
972 | * |
973 | * Using transparent bounds, the boundaries of the matching region are transparent |
974 | * to lookahead, lookbehind, and boundary matching constructs. Those constructs can |
975 | * see text beyond the boundaries of the region while checking for a match. |
976 | * |
977 | * With opaque bounds, no text outside of the matching region is visible to lookahead, |
978 | * lookbehind, and boundary matching constructs. |
979 | * |
980 | * By default, opaque bounds are used. |
981 | * |
982 | * @param regexp The compiled regular expression. |
983 | * @param b TRUE for transparent bounds; FALSE for opaque bounds. |
984 | * @param status A pointer to a UErrorCode to receive any errors. |
985 | * @stable ICU 4.0 |
986 | **/ |
987 | U_STABLE void U_EXPORT2 |
988 | uregex_useTransparentBounds(URegularExpression *regexp, |
989 | UBool b, |
990 | UErrorCode *status); |
991 | |
992 | |
993 | /** |
994 | * Return TRUE if this URegularExpression is using anchoring bounds. |
995 | * By default, anchoring region bounds are used. |
996 | * See uregex_useAnchoringBounds() for a description of anchoring bounds. |
997 | * @param regexp The compiled regular expression. |
998 | * @param status A pointer to a UErrorCode to receive any errors. |
999 | * @return TRUE if this matcher is using anchoring bounds. |
1000 | * @stable ICU 4.0 |
1001 | */ |
1002 | U_STABLE UBool U_EXPORT2 |
1003 | uregex_hasAnchoringBounds(const URegularExpression *regexp, |
1004 | UErrorCode *status); |
1005 | |
1006 | |
1007 | /** |
1008 | * Set whether this URegularExpression is using Anchoring Bounds for its region. |
1009 | * With anchoring bounds, pattern anchors such as ^ and $ will match at the start |
1010 | * and end of the region. Without Anchoring Bounds, anchors will only match at |
1011 | * the positions they would in the complete text. |
1012 | * |
1013 | * Anchoring Bounds are the default for regions. |
1014 | * |
1015 | * @param regexp The compiled regular expression. |
1016 | * @param b TRUE to enable anchoring bounds; FALSE to disable them. |
1017 | * @param status A pointer to a UErrorCode to receive any errors. |
1018 | * @stable ICU 4.0 |
1019 | */ |
1020 | U_STABLE void U_EXPORT2 |
1021 | uregex_useAnchoringBounds(URegularExpression *regexp, |
1022 | UBool b, |
1023 | UErrorCode *status); |
1024 | |
1025 | /** |
1026 | * Return TRUE if the most recent matching operation touched the |
1027 | * end of the text being processed. In this case, additional input text could |
1028 | * change the results of that match. |
1029 | * |
1030 | * @param regexp The compiled regular expression. |
1031 | * @param status A pointer to a UErrorCode to receive any errors. |
1032 | * @return TRUE if the most recent match hit the end of input. |
1033 | * @stable ICU 4.0 |
1034 | */ |
1035 | U_STABLE UBool U_EXPORT2 |
1036 | uregex_hitEnd(const URegularExpression *regexp, |
1037 | UErrorCode *status); |
1038 | |
1039 | /** |
1040 | * Return TRUE if the most recent match succeeded and additional input could cause |
1041 | * it to fail. If this function returns FALSE and a match was found, then more input |
1042 | * might change the match but the match won't be lost. If a match was not found, |
1043 | * then uregex_requireEnd() has no meaning. |
1044 | * |
1045 | * @param regexp The compiled regular expression. |
1046 | * @param status A pointer to a UErrorCode to receive any errors. |
1047 | * @return TRUE if more input could cause the most recent match to no longer match. |
1048 | * @stable ICU 4.0 |
1049 | */ |
1050 | U_STABLE UBool U_EXPORT2 |
1051 | uregex_requireEnd(const URegularExpression *regexp, |
1052 | UErrorCode *status); |
1053 | |
1054 | |
1055 | |
1056 | |
1057 | |
1058 | /** |
1059 | * Replaces every substring of the input that matches the pattern |
1060 | * with the given replacement string. This is a convenience function that |
1061 | * provides a complete find-and-replace-all operation. |
1062 | * |
1063 | * This function scans the input string looking for matches of the pattern. |
1064 | * Input that is not part of any match is copied unchanged to the |
1065 | * destination buffer. Matched regions are replaced in the output |
1066 | * buffer by the replacement string. The replacement string may contain |
1067 | * references to capture groups; these take the form of $1, $2, etc. |
1068 | * |
1069 | * @param regexp The compiled regular expression. |
1070 | * @param replacementText A string containing the replacement text. |
1071 | * @param replacementLength The length of the replacement string, or |
1072 | * -1 if it is NUL-terminated. |
1073 | * @param destBuf A (UChar *) buffer that will receive the result. |
1074 | * @param destCapacity The capacity of the destination buffer. |
1075 | * @param status A reference to a UErrorCode to receive any errors. |
1076 | * @return The length of the string resulting from the find |
1077 | * and replace operation. In the event that the |
1078 | * destination capacity is inadequate, the return value |
1079 | * is still the full length of the untruncated string. |
1080 | * @stable ICU 3.0 |
1081 | */ |
1082 | U_STABLE int32_t U_EXPORT2 |
1083 | uregex_replaceAll(URegularExpression *regexp, |
1084 | const UChar *replacementText, |
1085 | int32_t replacementLength, |
1086 | UChar *destBuf, |
1087 | int32_t destCapacity, |
1088 | UErrorCode *status); |
1089 | |
1090 | /** |
1091 | * Replaces every substring of the input that matches the pattern |
1092 | * with the given replacement string. This is a convenience function that |
1093 | * provides a complete find-and-replace-all operation. |
1094 | * |
1095 | * This function scans the input string looking for matches of the pattern. |
1096 | * Input that is not part of any match is copied unchanged to the |
1097 | * destination UText. Matched regions are replaced in the output |
1098 | * by the replacement string. The replacement string may contain |
1099 | * references to capture groups; these take the form of $1, $2, etc. |
1100 | * |
1101 | * @param regexp The compiled regular expression. |
1102 | * @param replacement A UText containing the replacement text. |
1103 | * @param dest A mutable UText that will receive the result. |
1104 | * If NULL, a new UText will be created (which may not be mutable). |
1105 | * @param status A reference to a UErrorCode to receive any errors. |
1106 | * @return A UText containing the results of the find and replace. |
1107 | * If a pre-allocated UText was provided, it will always be used and returned. |
1108 | * |
1109 | * @stable ICU 4.6 |
1110 | */ |
1111 | U_STABLE UText * U_EXPORT2 |
1112 | uregex_replaceAllUText(URegularExpression *regexp, |
1113 | UText *replacement, |
1114 | UText *dest, |
1115 | UErrorCode *status); |
1116 | |
1117 | /** |
1118 | * Replaces the first substring of the input that matches the pattern |
1119 | * with the given replacement string. This is a convenience function that |
1120 | * provides a complete find-and-replace operation. |
1121 | * |
1122 | * This function scans the input string looking for a match of the pattern. |
1123 | * All input that is not part of the match is copied unchanged to the |
1124 | * destination buffer. The matched region is replaced in the output |
1125 | * buffer by the replacement string. The replacement string may contain |
1126 | * references to capture groups; these take the form of $1, $2, etc. |
1127 | * |
1128 | * @param regexp The compiled regular expression. |
1129 | * @param replacementText A string containing the replacement text. |
1130 | * @param replacementLength The length of the replacement string, or |
1131 | * -1 if it is NUL-terminated. |
1132 | * @param destBuf A (UChar *) buffer that will receive the result. |
1133 | * @param destCapacity The capacity of the destination buffer. |
1134 | * @param status A reference to a UErrorCode to receive any errors. |
1135 | * @return The length of the string resulting from the find |
1136 | * and replace operation. In the event that the |
1137 | * destination capacity is inadequate, the return value |
1138 | * is still the full length of the untruncated string. |
1139 | * @stable ICU 3.0 |
1140 | */ |
1141 | U_STABLE int32_t U_EXPORT2 |
1142 | uregex_replaceFirst(URegularExpression *regexp, |
1143 | const UChar *replacementText, |
1144 | int32_t replacementLength, |
1145 | UChar *destBuf, |
1146 | int32_t destCapacity, |
1147 | UErrorCode *status); |
1148 | |
1149 | /** |
1150 | * Replaces the first substring of the input that matches the pattern |
1151 | * with the given replacement string. This is a convenience function that |
1152 | * provides a complete find-and-replace operation. |
1153 | * |
1154 | * This function scans the input string looking for a match of the pattern. |
1155 | * All input that is not part of the match is copied unchanged to the |
1156 | * destination UText. The matched region is replaced in the output |
1157 | * by the replacement string. The replacement string may contain |
1158 | * references to capture groups; these take the form of $1, $2, etc. |
1159 | * |
1160 | * @param regexp The compiled regular expression. |
1161 | * @param replacement A UText containing the replacement text. |
1162 | * @param dest A mutable UText that will receive the result. |
1163 | * If NULL, a new UText will be created (which may not be mutable). |
1164 | * @param status A reference to a UErrorCode to receive any errors. |
1165 | * @return A UText containing the results of the find and replace. |
1166 | * If a pre-allocated UText was provided, it will always be used and returned. |
1167 | * |
1168 | * @stable ICU 4.6 |
1169 | */ |
1170 | U_STABLE UText * U_EXPORT2 |
1171 | uregex_replaceFirstUText(URegularExpression *regexp, |
1172 | UText *replacement, |
1173 | UText *dest, |
1174 | UErrorCode *status); |
1175 | |
1176 | /** |
1177 | * Implements a replace operation intended to be used as part of an |
1178 | * incremental find-and-replace. |
1179 | * |
1180 | * <p>The input string, starting from the end of the previous match and ending at |
1181 | * the start of the current match, is appended to the destination string. Then the |
1182 | * replacement string is appended to the output string, |
1183 | * including handling any substitutions of captured text.</p> |
1184 | * |
1185 | * <p>A note on preflight computation of buffer size and error handling: |
1186 | * Calls to uregex_appendReplacement() and uregex_appendTail() are |
1187 | * designed to be chained, one after another, with the destination |
1188 | * buffer pointer and buffer capacity updated after each in preparation |
1189 | * for the next. If the destination buffer is exhausted partway through such a |
1190 | * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal |
1191 | * ICU conventions are for a function to perform no action if it is |
1192 | * called with an error status, but for this one case, uregex_appendReplacement() |
1193 | * will operate normally so that buffer size computations will complete |
1194 | * correctly.</p> |
1195 | * |
1196 | * <p>For simple, prepackaged, non-incremental find-and-replace |
1197 | * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> |
1198 | * |
1199 | * @param regexp The regular expression object. |
1200 | * @param replacementText The string that will replace the matched portion of the |
1201 | * input string as it is copied to the destination buffer. |
1202 | * The replacement text may contain references ($1, for |
1203 | * example) to capture groups from the match. |
1204 | * @param replacementLength The length of the replacement text string, |
1205 | * or -1 if the string is NUL-terminated. |
1206 | * @param destBuf The buffer into which the results of the |
1207 | * find-and-replace are placed. On return, this pointer |
1208 | * will be updated to refer to the beginning of the |
1209 | * unused portion of buffer, leaving it in position for |
1210 | * a subsequent call to this function. |
1211 | * @param destCapacity The size of the output buffer. On return, this |
1212 | * parameter will be updated to reflect the space remaining |
1213 | * unused in the output buffer. |
1214 | * @param status A reference to a UErrorCode to receive any errors. |
1215 | * @return The length of the result string. In the event that |
1216 | * destCapacity is inadequate, the full length of the |
1217 | * untruncated output string is returned. |
1218 | * |
1219 | * @stable ICU 3.0 |
1220 | * |
1221 | */ |
1222 | U_STABLE int32_t U_EXPORT2 |
1223 | uregex_appendReplacement(URegularExpression *regexp, |
1224 | const UChar *replacementText, |
1225 | int32_t replacementLength, |
1226 | UChar **destBuf, |
1227 | int32_t *destCapacity, |
1228 | UErrorCode *status); |
1229 | |
1230 | /** |
1231 | * Implements a replace operation intended to be used as part of an |
1232 | * incremental find-and-replace. |
1233 | * |
1234 | * <p>The input string, starting from the end of the previous match and ending at |
1235 | * the start of the current match, is appended to the destination string. Then the |
1236 | * replacement string is appended to the output string, |
1237 | * including handling any substitutions of captured text.</p> |
1238 | * |
1239 | * <p>For simple, prepackaged, non-incremental find-and-replace |
1240 | * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p> |
1241 | * |
1242 | * @param regexp The regular expression object. |
1243 | * @param replacementText The text that will replace the matched portion of the |
1244 | * input string as it is copied to the destination UText. |
1245 | * The replacement text may contain references ($1, for |
1246 | * example) to capture groups from the match. |
1247 | * @param dest A mutable UText that will receive the result. Must not be NULL. |
1248 | * @param status A reference to a UErrorCode to receive any errors. |
1249 | * |
1250 | * @stable ICU 4.6 |
1251 | */ |
1252 | U_STABLE void U_EXPORT2 |
1253 | uregex_appendReplacementUText(URegularExpression *regexp, |
1254 | UText *replacementText, |
1255 | UText *dest, |
1256 | UErrorCode *status); |
1257 | |
1258 | /** |
1259 | * As the final step in a find-and-replace operation, append the remainder |
1260 | * of the input string, starting at the position following the last match, |
1261 | * to the destination string. <code>uregex_appendTail()</code> is intended |
1262 | * to be invoked after one or more invocations of the |
1263 | * <code>uregex_appendReplacement()</code> function. |
1264 | * |
1265 | * @param regexp The regular expression object. This is needed to |
1266 | * obtain the input string along with the position |
1267 | * of the last match within it. |
1268 | * @param destBuf The buffer in which the results of the |
1269 | * find-and-replace are placed. On return, the pointer |
1270 | * will be updated to refer to the beginning of the |
1271 | * unused portion of buffer. |
1272 | * @param destCapacity The size of the output buffer. On return, this |
1273 | * value will be updated to reflect the space remaining |
1274 | * unused in the output buffer. |
1275 | * @param status A reference to a UErrorCode to receive any errors. |
1276 | * @return The length of the result string. In the event that |
1277 | * destCapacity is inadequate, the full length of the |
1278 | * untruncated output string is returned. |
1279 | * |
1280 | * @stable ICU 3.0 |
1281 | */ |
1282 | U_STABLE int32_t U_EXPORT2 |
1283 | uregex_appendTail(URegularExpression *regexp, |
1284 | UChar **destBuf, |
1285 | int32_t *destCapacity, |
1286 | UErrorCode *status); |
1287 | |
1288 | /** |
1289 | * As the final step in a find-and-replace operation, append the remainder |
1290 | * of the input string, starting at the position following the last match, |
1291 | * to the destination string. <code>uregex_appendTailUText()</code> is intended |
1292 | * to be invoked after one or more invocations of the |
1293 | * <code>uregex_appendReplacementUText()</code> function. |
1294 | * |
1295 | * @param regexp The regular expression object. This is needed to |
1296 | * obtain the input string along with the position |
1297 | * of the last match within it. |
1298 | * @param dest A mutable UText that will receive the result. Must not be NULL. |
1299 | * |
1300 | * @param status A reference to a UErrorCode to receive any errors. |
1301 | * |
1302 | * @return The destination UText. |
1303 | * |
1304 | * @stable ICU 4.6 |
1305 | */ |
1306 | U_STABLE UText * U_EXPORT2 |
1307 | uregex_appendTailUText(URegularExpression *regexp, |
1308 | UText *dest, |
1309 | UErrorCode *status); |
1310 | |
1311 | /** |
1312 | * Split a string into fields. Somewhat like split() from Perl. |
1313 | * The pattern matches identify delimiters that separate the input |
1314 | * into fields. The input data between the matches becomes the |
1315 | * fields themselves. |
1316 | * |
1317 | * Each of the fields is copied from the input string to the destination |
1318 | * buffer, and NUL-terminated. The position of each field within |
1319 | * the destination buffer is returned in the destFields array. |
1320 | * |
1321 | * If the delimiter pattern includes capture groups, the captured text will |
1322 | * also appear in the destination array of output strings, interspersed |
1323 | * with the fields. This is similar to Perl, but differs from Java, |
1324 | * which ignores the presence of capture groups in the pattern. |
1325 | * |
1326 | * Trailing empty fields will always be returned, assuming sufficient |
1327 | * destination capacity. This differs from the default behavior for Java |
1328 | * and Perl where trailing empty fields are not returned. |
1329 | * |
1330 | * The number of strings produced by the split operation is returned. |
1331 | * This count includes the strings from capture groups in the delimiter pattern. |
1332 | * This behavior differs from Java, which ignores capture groups. |
1333 | * |
1334 | * @param regexp The compiled regular expression. |
1335 | * @param destBuf A (UChar *) buffer to receive the fields that |
1336 | * are extracted from the input string. The |
1337 | * field pointers will refer to positions within the |
1338 | * destination buffer supplied by the caller. Any |
1339 | * extra positions within the destFields array will be |
1340 | * set to NULL. |
1341 | * @param destCapacity The capacity of the destBuf. |
1342 | * @param requiredCapacity The actual capacity required of the destBuf. |
1343 | * If destCapacity is too small, requiredCapacity will return |
1344 | * the total capacity required to hold all of the output, and |
1345 | * a U_BUFFER_OVERFLOW_ERROR will be returned. |
1346 | * @param destFields An array to be filled with the position of each |
1347 | * of the extracted fields within destBuf. |
1348 | * @param destFieldsCapacity The number of elements in the destFields array. |
1349 | * If the number of fields found is less than destFieldsCapacity, |
1350 | * the extra destFields elements are set to zero. |
1351 | * If destFieldsCapacity is too small, the trailing part of the |
1352 | * input, including any field delimiters, is treated as if it |
1353 | * were the last field - it is copied to the destBuf, and |
1354 | * its position in the destBuf is stored in the last element |
1355 | * of destFields. This behavior mimics that of Perl. It is not |
1356 | * an error condition, and no error status is returned when all destFields |
1357 | * positions are used. |
1358 | * @param status A reference to a UErrorCode to receive any errors. |
1359 | * @return The number of fields into which the input string was split. |
1360 | * @stable ICU 3.0 |
1361 | */ |
1362 | U_STABLE int32_t U_EXPORT2 |
1363 | uregex_split( URegularExpression *regexp, |
1364 | UChar *destBuf, |
1365 | int32_t destCapacity, |
1366 | int32_t *requiredCapacity, |
1367 | UChar *destFields[], |
1368 | int32_t destFieldsCapacity, |
1369 | UErrorCode *status); |
1370 | |
1371 | /** |
1372 | * Split a string into fields. Somewhat like split() from Perl. |
1373 | * The pattern matches identify delimiters that separate the input |
1374 | * into fields. The input data between the matches becomes the |
1375 | * fields themselves. |
1376 | * <p> |
1377 | * The behavior of this function is not very closely aligned with uregex_split(); |
1378 | * instead, it is based on (and implemented directly on top of) the C++ split method.</p> |
1379 | * |
1380 | * @param regexp The compiled regular expression. |
1381 | * @param destFields An array of mutable UText structs to receive the results of the split. |
1382 | * If a field is NULL, a new UText is allocated to contain the results for |
1383 | * that field. This new UText is not guaranteed to be mutable. |
1384 | * @param destFieldsCapacity The number of elements in the destination array. |
1385 | * If the number of fields found is less than destFieldsCapacity, the |
1386 | * extra strings in the destination array are not altered. |
1387 | * If the number of destination strings is less than the number |
1388 | * of fields, the trailing part of the input string, including any |
1389 | * field delimiters, is placed in the last destination string. |
1390 | * This behavior mimics that of Perl. It is not an error condition, and no |
1391 | * error status is returned when all destFields positions are used. |
1392 | * @param status A reference to a UErrorCode to receive any errors. |
1393 | * @return The number of fields into which the input string was split. |
1394 | * |
1395 | * @stable ICU 4.6 |
1396 | */ |
1397 | U_STABLE int32_t U_EXPORT2 |
1398 | uregex_splitUText(URegularExpression *regexp, |
1399 | UText *destFields[], |
1400 | int32_t destFieldsCapacity, |
1401 | UErrorCode *status); |
1402 | |
1403 | /** |
1404 | * Set a processing time limit for match operations with this URegularExpression. |
1405 | * |
1406 | * Some patterns, when matching certain strings, can run in exponential time. |
1407 | * For practical purposes, the match operation may appear to be in an |
1408 | * infinite loop. |
1409 | * When a limit is set, a match operation will fail with an error if the |
1410 | * limit is exceeded. |
1411 | * <p> |
1412 | * The units of the limit are steps of the match engine. |
1413 | * Correspondence with actual processor time will depend on the speed |
1414 | * of the processor and the details of the specific pattern, but will |
1415 | * typically be on the order of milliseconds. |
1416 | * <p> |
1417 | * By default, the matching time is not limited. |
1418 | * The limit currently in effect can be retrieved with uregex_getTimeLimit(). |
1419 | * |
1420 | * @param regexp The compiled regular expression. |
1421 | * @param limit The limit value, or 0 for no limit. |
1422 | * @param status A reference to a UErrorCode to receive any errors. |
1423 | * @stable ICU 4.0 |
1424 | */ |
1425 | U_STABLE void U_EXPORT2 |
1426 | uregex_setTimeLimit(URegularExpression *regexp, |
1427 | int32_t limit, |
1428 | UErrorCode *status); |
1429 | |
1430 | /** |
1431 | * Get the time limit for matches with this URegularExpression. |
1432 | * A return value of zero indicates that there is no limit. |
1433 | * |
1434 | * @param regexp The compiled regular expression. |
1435 | * @param status A reference to a UErrorCode to receive any errors. |
1436 | * @return the maximum allowed time for a match, in units of processing steps. |
1437 | * @stable ICU 4.0 |
1438 | */ |
1439 | U_STABLE int32_t U_EXPORT2 |
1440 | uregex_getTimeLimit(const URegularExpression *regexp, |
1441 | UErrorCode *status); |
1442 | |
1443 | /** |
1444 | * Set the amount of heap storage available for use by the match backtracking stack. |
1445 | * <p> |
1446 | * ICU uses a backtracking regular expression engine, with the backtrack stack |
1447 | * maintained on the heap. This function sets the limit on the amount of memory |
1448 | * that can be used for this purpose. A backtracking stack overflow will |
1449 | * result in an error from the match operation that caused it. |
1450 | * <p> |
1451 | * A limit is desirable because a malicious or poorly designed pattern can use |
1452 | * excessive memory, potentially crashing the process. A limit is enabled |
1453 | * by default. |
1454 | * |
1455 | * @param regexp The compiled regular expression. |
1456 | * @param limit The maximum size, in bytes, of the matching backtrack stack. |
1457 | * A value of zero means no limit. |
1458 | * The limit must be greater than or equal to zero. |
1459 | * @param status A reference to a UErrorCode to receive any errors. |
1460 | * |
1461 | * @stable ICU 4.0 |
1462 | */ |
1463 | U_STABLE void U_EXPORT2 |
1464 | uregex_setStackLimit(URegularExpression *regexp, |
1465 | int32_t limit, |
1466 | UErrorCode *status); |
1467 | |
1468 | /** |
1469 | * Get the size of the heap storage available for use by the backtracking stack. |
1470 | * @param regexp The compiled regular expression. |
1471 | * @param status A reference to a UErrorCode to receive any errors. |
1472 | * @return the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited. |
1473 | * @stable ICU 4.0 |
1474 | */ |
1475 | U_STABLE int32_t U_EXPORT2 |
1476 | uregex_getStackLimit(const URegularExpression *regexp, |
1477 | UErrorCode *status); |
1478 | |
1479 | |
1480 | /** |
1481 | * Function pointer for a regular expression matching callback function. |
1482 | * When set, a callback function will be called periodically during matching |
1483 | * operations. If the callback function returns FALSE, the matching |
1484 | * operation will be terminated early. |
1485 | * |
1486 | * Note: the callback function must not call other functions on this |
1487 | * URegularExpression. |
1488 | * |
1489 | * @param context context pointer. The callback function will be invoked |
1490 | * with the context specified at the time that |
1491 | * uregex_setMatchCallback() is called. |
1492 | * @param steps the accumulated processing time, in match steps, |
1493 | * for this matching operation. |
1494 | * @return TRUE to continue the matching operation. |
1495 | * FALSE to terminate the matching operation. |
1496 | * @stable ICU 4.0 |
1497 | */ |
1498 | U_CDECL_BEGIN |
1499 | typedef UBool U_CALLCONV URegexMatchCallback ( |
1500 | const void *context, |
1501 | int32_t steps); |
1502 | U_CDECL_END |
1503 | |
1504 | /** |
1505 | * Set the match callback function for this URegularExpression. |
1506 | * During matching operations the function will be called periodically, |
1507 | * giving the application the opportunity to terminate a long-running |
1508 | * match by returning FALSE (see URegexMatchCallback). |
1509 | * |
1510 | * @param regexp The compiled regular expression. |
1511 | * @param callback A pointer to the user-supplied callback function. |
1512 | * @param context User context pointer. The value supplied at the |
1513 | * time the callback function is set will be saved |
1514 | * and passed to the callback each time that it is called. |
1515 | * @param status A reference to a UErrorCode to receive any errors. |
1516 | * @stable ICU 4.0 |
1517 | */ |
1518 | U_STABLE void U_EXPORT2 |
1519 | uregex_setMatchCallback(URegularExpression *regexp, |
1520 | URegexMatchCallback *callback, |
1521 | const void *context, |
1522 | UErrorCode *status); |
1523 | |
1524 | |
1525 | /** |
1526 | * Get the match callback function and its context pointer for this URegularExpression. |
1527 | * |
1528 | * @param regexp The compiled regular expression. |
1529 | * @param callback Out parameter, receives a pointer to the user-supplied |
1530 | * callback function. |
1531 | * @param context Out parameter, receives the user context pointer that |
1532 | * was set when uregex_setMatchCallback() was called. |
1533 | * @param status A reference to a UErrorCode to receive any errors. |
1534 | * @stable ICU 4.0 |
1535 | */ |
1536 | U_STABLE void U_EXPORT2 |
1537 | uregex_getMatchCallback(const URegularExpression *regexp, |
1538 | URegexMatchCallback **callback, |
1539 | const void **context, |
1540 | UErrorCode *status); |
1541 | |
1542 | /** |
1543 | * Function pointer for a regular expression find callback function. |
1544 | * |
1545 | * When set, a callback function will be called during a find operation |
1546 | * and for operations that depend on find, such as findNext, split and some replace |
1547 | * operations like replaceFirst. |
1548 | * The callback will usually be called after each attempt at a match, but this is not a |
1549 | * guarantee that the callback will be invoked at each character. For finds where the |
1550 | * match engine is invoked at each character, this may be close to true, but less likely |
1551 | * for more optimized loops where the pattern is known to only start, and the match |
1552 | * engine invoked, at certain characters. |
1553 | * When invoked, this callback will specify the index at which a match operation is about |
1554 | * to be attempted, giving the application the opportunity to terminate a long-running |
1555 | * find operation. |
1556 | * |
1557 | * If the callback function returns FALSE, the find operation will be terminated early. |
1558 | * |
1559 | * Note: the callback function must not call other functions on this |
1560 | * URegularExpression. |
1561 | * |
1562 | * @param context context pointer. The callback function will be invoked |
1563 | * with the context specified at the time that |
1564 | * uregex_setFindProgressCallback() is called. |
1565 | * @param matchIndex the next index at which a match will be attempted for this |
1566 | * find operation. If this callback interrupts the search, this is the |
1567 | * index at which a find/findNext operation may be re-initiated. |
1568 | * @return TRUE to continue the matching operation. |
1569 | * FALSE to terminate the matching operation. |
1570 | * @stable ICU 4.6 |
1571 | */ |
1572 | U_CDECL_BEGIN |
1573 | typedef UBool U_CALLCONV URegexFindProgressCallback ( |
1574 | const void *context, |
1575 | int64_t matchIndex); |
1576 | U_CDECL_END |
1577 | |
1578 | |
1579 | /** |
1580 | * Set the find progress callback function for this URegularExpression; see URegexFindProgressCallback for when it is invoked. |
1581 | * |
1582 | * @param regexp The compiled regular expression. |
1583 | * @param callback A pointer to the user-supplied callback function. |
1584 | * @param context User context pointer. The value supplied at the |
1585 | * time the callback function is set will be saved |
1586 | * and passed to the callback each time that it is called. |
1587 | * @param status A reference to a UErrorCode to receive any errors. |
1588 | * @stable ICU 4.6 |
1589 | */ |
1590 | U_STABLE void U_EXPORT2 |
1591 | uregex_setFindProgressCallback(URegularExpression *regexp, |
1592 | URegexFindProgressCallback *callback, |
1593 | const void *context, |
1594 | UErrorCode *status); |
1595 | |
1596 | /** |
1597 | * Get the find progress callback function and its context pointer for this URegularExpression. |
1598 | * |
1599 | * @param regexp The compiled regular expression. |
1600 | * @param callback Out parameter, receives a pointer to the user-supplied |
1601 | * callback function. |
1602 | * @param context Out parameter, receives the user context pointer that |
1603 | * was set when uregex_setFindProgressCallback() was called. |
1604 | * @param status A reference to a UErrorCode to receive any errors. |
1605 | * @stable ICU 4.6 |
1606 | */ |
1607 | U_STABLE void U_EXPORT2 |
1608 | uregex_getFindProgressCallback(const URegularExpression *regexp, |
1609 | URegexFindProgressCallback **callback, |
1610 | const void **context, |
1611 | UErrorCode *status); |
1612 | |
1613 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
1614 | #endif /* UREGEX_H */ |
1615 | |