1 | /* |
2 | * Internal interface definitions, etc., for the reg package |
3 | * |
4 | * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. |
5 | * |
6 | * Development of this software was funded, in part, by Cray Research Inc., |
7 | * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics |
8 | * Corporation, none of whom are responsible for the results. The author |
9 | * thanks all of them. |
10 | * |
11 | * Redistribution and use in source and binary forms -- with or without |
12 | * modification -- are permitted for any purpose, provided that |
13 | * redistributions in source form retain this entire copyright notice and |
14 | * indicate the origin and nature of any modifications. |
15 | * |
16 | * I'd appreciate being given credit for this package in the documentation |
17 | * of software which uses it, but that is not a requirement. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
20 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY |
21 | * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL |
22 | * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
25 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
26 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
27 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
28 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | * |
30 | * src/include/regex/regguts.h |
31 | */ |
32 | |
33 | |
34 | |
35 | /* |
36 | * Environmental customization. It should not (I hope) be necessary to |
37 | * alter the file you are now reading -- regcustom.h should handle it all, |
38 | * given care here and elsewhere. |
39 | */ |
40 | #include "regcustom.h" |
41 | |
42 | |
43 | |
44 | /* |
45 | * Things that regcustom.h might override. |
46 | */ |
47 | |
48 | /* assertions */ |
49 | #ifndef assert |
50 | #ifndef REG_DEBUG |
51 | #define NDEBUG /* no assertions */ |
52 | #endif |
53 | #include <assert.h> |
54 | #endif |
55 | |
56 | /* voids */ |
57 | #ifndef DISCARD |
58 | #define DISCARD void /* for throwing values away */ |
59 | #endif |
60 | #ifndef VS |
61 | #define VS(x) ((void *)(x)) /* cast something to generic ptr */ |
62 | #endif |
63 | |
64 | /* function-pointer declarator */ |
65 | #ifndef FUNCPTR |
66 | #define FUNCPTR(name, args) (*(name)) args |
67 | #endif |
68 | |
69 | /* memory allocation */ |
70 | #ifndef MALLOC |
71 | #define MALLOC(n) malloc(n) |
72 | #endif |
73 | #ifndef REALLOC |
74 | #define REALLOC(p, n) realloc(VS(p), n) |
75 | #endif |
76 | #ifndef FREE |
77 | #define FREE(p) free(VS(p)) |
78 | #endif |
79 | |
80 | /* want size of a char in bits, and max value in bounded quantifiers */ |
81 | #ifndef _POSIX2_RE_DUP_MAX |
82 | #define _POSIX2_RE_DUP_MAX 255 /* normally from <limits.h> */ |
83 | #endif |
84 | |
85 | |
86 | |
87 | /* |
88 | * misc |
89 | */ |
90 | |
91 | #define NOTREACHED 0 |
92 | |
93 | #define DUPMAX _POSIX2_RE_DUP_MAX |
94 | #define DUPINF (DUPMAX+1) |
95 | |
96 | #define REMAGIC 0xfed7 /* magic number for main struct */ |
97 | |
98 | /* Type codes for lookaround constraints */ |
99 | #define LATYPE_AHEAD_POS 03 /* positive lookahead */ |
100 | #define LATYPE_AHEAD_NEG 02 /* negative lookahead */ |
101 | #define LATYPE_BEHIND_POS 01 /* positive lookbehind */ |
102 | #define LATYPE_BEHIND_NEG 00 /* negative lookbehind */ |
103 | #define LATYPE_IS_POS(la) ((la) & 01) |
104 | #define LATYPE_IS_AHEAD(la) ((la) & 02) |
105 | |
106 | |
107 | /* |
108 | * debugging facilities |
109 | */ |
110 | #ifdef REG_DEBUG |
111 | /* FDEBUG does finite-state tracing */ |
112 | #define FDEBUG(arglist) { if (v->eflags®_FTRACE) printf arglist; } |
113 | /* MDEBUG does higher-level tracing */ |
114 | #define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } |
115 | #else |
116 | #define FDEBUG(arglist) {} |
117 | #define MDEBUG(arglist) {} |
118 | #endif |
119 | |
120 | |
121 | |
122 | /* |
123 | * bitmap manipulation |
124 | */ |
125 | #define UBITS (CHAR_BIT * sizeof(unsigned)) |
126 | #define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS)) |
127 | #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) |
128 | |
129 | |
130 | /* |
131 | * As soon as possible, we map chrs into equivalence classes -- "colors" -- |
132 | * which are of much more manageable number. |
133 | */ |
134 | typedef short color; /* colors of characters */ |
135 | |
136 | #define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */ |
137 | #define COLORLESS (-1) /* impossible color */ |
138 | #define WHITE 0 /* default color, parent of all others */ |
139 | /* Note: various places in the code know that WHITE is zero */ |
140 | |
141 | |
142 | /* |
143 | * Per-color data structure for the compile-time color machinery |
144 | * |
145 | * If "sub" is not NOSUB then it is the number of the color's current |
146 | * subcolor, i.e. we are in process of dividing this color (character |
147 | * equivalence class) into two colors. See src/backend/regex/README for |
148 | * discussion of subcolors. |
149 | * |
150 | * Currently-unused colors have the FREECOL bit set and are linked into a |
151 | * freelist using their "sub" fields, but only if their color numbers are |
152 | * less than colormap.max. Any array entries beyond "max" are just garbage. |
153 | */ |
154 | struct colordesc |
155 | { |
156 | int nschrs; /* number of simple chars of this color */ |
157 | int nuchrs; /* number of upper map entries of this color */ |
158 | color sub; /* open subcolor, if any; or free-chain ptr */ |
159 | #define NOSUB COLORLESS /* value of "sub" when no open subcolor */ |
160 | struct arc *arcs; /* chain of all arcs of this color */ |
161 | chr firstchr; /* simple char first assigned to this color */ |
162 | int flags; /* bit values defined next */ |
163 | #define FREECOL 01 /* currently free */ |
164 | #define PSEUDO 02 /* pseudocolor, no real chars */ |
165 | #define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) |
166 | }; |
167 | |
168 | /* |
169 | * The color map itself |
170 | * |
171 | * This struct holds both data used only at compile time, and the chr to |
172 | * color mapping information, used at both compile and run time. The latter |
173 | * is the bulk of the space, so it's not really worth separating out the |
174 | * compile-only portion. |
175 | * |
176 | * Ideally, the mapping data would just be an array of colors indexed by |
177 | * chr codes; but for large character sets that's impractical. Fortunately, |
178 | * common characters have smaller codes, so we can use a simple array for chr |
179 | * codes up to MAX_SIMPLE_CHR, and do something more complex for codes above |
180 | * that, without much loss of performance. The "something more complex" is a |
181 | * 2-D array of color entries, where row indexes correspond to individual chrs |
182 | * or chr ranges that have been mentioned in the regex (with row zero |
183 | * representing all other chrs), and column indexes correspond to different |
184 | * sets of locale-dependent character classes such as "isalpha". The |
185 | * classbits[k] entry is zero if we do not care about the k'th character class |
186 | * in this regex, and otherwise it is the bit to be OR'd into the column index |
187 | * if the character in question is a member of that class. We find the color |
188 | * of a high-valued chr by identifying which colormaprange it is in to get |
189 | * the row index (use row zero if it's in none of them), identifying which of |
190 | * the interesting cclasses it's in to get the column index, and then indexing |
191 | * into the 2-D hicolormap array. |
192 | * |
193 | * The colormapranges are required to be nonempty, nonoverlapping, and to |
194 | * appear in increasing chr-value order. |
195 | */ |
196 | |
197 | #define NUM_CCLASSES 13 /* must match data in regc_locale.c */ |
198 | |
199 | typedef struct colormaprange |
200 | { |
201 | chr cmin; /* range represents cmin..cmax inclusive */ |
202 | chr cmax; |
203 | int rownum; /* row index in hicolormap array (>= 1) */ |
204 | } colormaprange; |
205 | |
206 | struct colormap |
207 | { |
208 | int magic; |
209 | #define CMMAGIC 0x876 |
210 | struct vars *v; /* for compile error reporting */ |
211 | size_t ncds; /* allocated length of colordescs array */ |
212 | size_t max; /* highest color number currently in use */ |
213 | color free; /* beginning of free chain (if non-0) */ |
214 | struct colordesc *cd; /* pointer to array of colordescs */ |
215 | #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) |
216 | |
217 | /* mapping data for chrs <= MAX_SIMPLE_CHR: */ |
218 | color *locolormap; /* simple array indexed by chr code */ |
219 | |
220 | /* mapping data for chrs > MAX_SIMPLE_CHR: */ |
221 | int classbits[NUM_CCLASSES]; /* see comment above */ |
222 | int numcmranges; /* number of colormapranges */ |
223 | colormaprange *cmranges; /* ranges of high chrs */ |
224 | color *hicolormap; /* 2-D array of color entries */ |
225 | int maxarrayrows; /* number of array rows allocated */ |
226 | int hiarrayrows; /* number of array rows in use */ |
227 | int hiarraycols; /* number of array columns (2^N) */ |
228 | |
229 | /* If we need up to NINLINECDS, we store them here to save a malloc */ |
230 | #define NINLINECDS ((size_t) 10) |
231 | struct colordesc cdspace[NINLINECDS]; |
232 | }; |
233 | |
234 | /* fetch color for chr; beware of multiple evaluation of c argument */ |
235 | #define GETCOLOR(cm, c) \ |
236 | ((c) <= MAX_SIMPLE_CHR ? (cm)->locolormap[(c) - CHR_MIN] : pg_reg_getcolor(cm, c)) |
237 | |
238 | |
239 | /* |
240 | * Interface definitions for locale-interface functions in regc_locale.c. |
241 | */ |
242 | |
243 | /* |
244 | * Representation of a set of characters. chrs[] represents individual |
245 | * code points, ranges[] represents ranges in the form min..max inclusive. |
246 | * |
247 | * If the cvec represents a locale-specific character class, eg [[:alpha:]], |
248 | * then the chrs[] and ranges[] arrays contain only members of that class |
249 | * up to MAX_SIMPLE_CHR (inclusive). cclasscode is set to regc_locale.c's |
250 | * code for the class, rather than being -1 as it is in an ordinary cvec. |
251 | * |
252 | * Note that in cvecs gotten from newcvec() and intended to be freed by |
253 | * freecvec(), both arrays of chrs are after the end of the struct, not |
254 | * separately malloc'd; so chrspace and rangespace are effectively immutable. |
255 | */ |
256 | struct cvec |
257 | { |
258 | int nchrs; /* number of chrs */ |
259 | int chrspace; /* number of chrs allocated in chrs[] */ |
260 | chr *chrs; /* pointer to vector of chrs */ |
261 | int nranges; /* number of ranges (chr pairs) */ |
262 | int rangespace; /* number of ranges allocated in ranges[] */ |
263 | chr *ranges; /* pointer to vector of chr pairs */ |
264 | int cclasscode; /* value of "enum classes", or -1 */ |
265 | }; |
266 | |
267 | |
268 | /* |
269 | * definitions for NFA internal representation |
270 | * |
271 | * Having a "from" pointer within each arc may seem redundant, but it |
272 | * saves a lot of hassle. |
273 | */ |
274 | struct state; |
275 | |
276 | struct arc |
277 | { |
278 | int type; /* 0 if free, else an NFA arc type code */ |
279 | color co; |
280 | struct state *from; /* where it's from (and contained within) */ |
281 | struct state *to; /* where it's to */ |
282 | struct arc *outchain; /* link in *from's outs chain or free chain */ |
283 | struct arc *outchainRev; /* back-link in *from's outs chain */ |
284 | #define freechain outchain /* we do not maintain "freechainRev" */ |
285 | struct arc *inchain; /* link in *to's ins chain */ |
286 | struct arc *inchainRev; /* back-link in *to's ins chain */ |
287 | struct arc *colorchain; /* link in color's arc chain */ |
288 | struct arc *colorchainRev; /* back-link in color's arc chain */ |
289 | }; |
290 | |
291 | struct arcbatch |
292 | { /* for bulk allocation of arcs */ |
293 | struct arcbatch *next; |
294 | #define ABSIZE 10 |
295 | struct arc a[ABSIZE]; |
296 | }; |
297 | |
298 | struct state |
299 | { |
300 | int no; |
301 | #define FREESTATE (-1) |
302 | char flag; /* marks special states */ |
303 | int nins; /* number of inarcs */ |
304 | struct arc *ins; /* chain of inarcs */ |
305 | int nouts; /* number of outarcs */ |
306 | struct arc *outs; /* chain of outarcs */ |
307 | struct arc *free; /* chain of free arcs */ |
308 | struct state *tmp; /* temporary for traversal algorithms */ |
309 | struct state *next; /* chain for traversing all */ |
310 | struct state *prev; /* back chain */ |
311 | struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */ |
312 | int noas; /* number of arcs used in first arcbatch */ |
313 | }; |
314 | |
315 | struct nfa |
316 | { |
317 | struct state *pre; /* pre-initial state */ |
318 | struct state *init; /* initial state */ |
319 | struct state *final; /* final state */ |
320 | struct state *post; /* post-final state */ |
321 | int nstates; /* for numbering states */ |
322 | struct state *states; /* state-chain header */ |
323 | struct state *slast; /* tail of the chain */ |
324 | struct state *free; /* free list */ |
325 | struct colormap *cm; /* the color map */ |
326 | color bos[2]; /* colors, if any, assigned to BOS and BOL */ |
327 | color eos[2]; /* colors, if any, assigned to EOS and EOL */ |
328 | struct vars *v; /* simplifies compile error reporting */ |
329 | struct nfa *parent; /* parent NFA, if any */ |
330 | }; |
331 | |
332 | |
333 | |
334 | /* |
335 | * definitions for compacted NFA |
336 | * |
337 | * The main space savings in a compacted NFA is from making the arcs as small |
338 | * as possible. We store only the transition color and next-state number for |
339 | * each arc. The list of out arcs for each state is an array beginning at |
340 | * cnfa.states[statenumber], and terminated by a dummy carc struct with |
341 | * co == COLORLESS. |
342 | * |
343 | * The non-dummy carc structs are of two types: plain arcs and LACON arcs. |
344 | * Plain arcs just store the transition color number as "co". LACON arcs |
345 | * store the lookaround constraint number plus cnfa.ncolors as "co". LACON |
346 | * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. |
347 | */ |
348 | struct carc |
349 | { |
350 | color co; /* COLORLESS is list terminator */ |
351 | int to; /* next-state number */ |
352 | }; |
353 | |
354 | struct cnfa |
355 | { |
356 | int nstates; /* number of states */ |
357 | int ncolors; /* number of colors (max color in use + 1) */ |
358 | int flags; |
359 | #define HASLACONS 01 /* uses lookaround constraints */ |
360 | int pre; /* setup state number */ |
361 | int post; /* teardown state number */ |
362 | color bos[2]; /* colors, if any, assigned to BOS and BOL */ |
363 | color eos[2]; /* colors, if any, assigned to EOS and EOL */ |
364 | char *stflags; /* vector of per-state flags bytes */ |
365 | #define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ |
366 | struct carc **states; /* vector of pointers to outarc lists */ |
367 | /* states[n] are pointers into a single malloc'd array of arcs */ |
368 | struct carc *arcs; /* the area for the lists */ |
369 | }; |
370 | |
371 | #define ZAPCNFA(cnfa) ((cnfa).nstates = 0) |
372 | #define NULLCNFA(cnfa) ((cnfa).nstates == 0) |
373 | |
374 | /* |
375 | * This symbol limits the transient heap space used by the regex compiler, |
376 | * and thereby also the maximum complexity of NFAs that we'll deal with. |
377 | * Currently we only count NFA states and arcs against this; the other |
378 | * transient data is generally not large enough to notice compared to those. |
379 | * Note that we do not charge anything for the final output data structures |
380 | * (the compacted NFA and the colormap). |
381 | */ |
382 | #ifndef REG_MAX_COMPILE_SPACE |
383 | #define REG_MAX_COMPILE_SPACE \ |
384 | (100000 * sizeof(struct state) + 100000 * sizeof(struct arcbatch)) |
385 | #endif |
386 | |
387 | /* |
388 | * subexpression tree |
389 | * |
390 | * "op" is one of: |
391 | * '=' plain regex without interesting substructure (implemented as DFA) |
392 | * 'b' back-reference (has no substructure either) |
393 | * '(' capture node: captures the match of its single child |
394 | * '.' concatenation: matches a match for left, then a match for right |
395 | * '|' alternation: matches a match for left or a match for right |
396 | * '*' iteration: matches some number of matches of its single child |
397 | * |
398 | * Note: the right child of an alternation must be another alternation or |
399 | * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you |
400 | * might expect. This could stand to be changed. Actually I'd rather see |
401 | * a single alternation node with N children, but that will take revising |
402 | * the representation of struct subre. |
403 | * |
404 | * Note: when a backref is directly quantified, we stick the min/max counts |
405 | * into the backref rather than plastering an iteration node on top. This is |
406 | * for efficiency: there is no need to search for possible division points. |
407 | */ |
408 | struct subre |
409 | { |
410 | char op; /* see type codes above */ |
411 | char flags; |
412 | #define LONGER 01 /* prefers longer match */ |
413 | #define SHORTER 02 /* prefers shorter match */ |
414 | #define MIXED 04 /* mixed preference below */ |
415 | #define CAP 010 /* capturing parens below */ |
416 | #define BACKR 020 /* back reference below */ |
417 | #define INUSE 0100 /* in use in final tree */ |
418 | #define NOPROP 03 /* bits which may not propagate up */ |
419 | #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ |
420 | #define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ |
421 | #define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED)) |
422 | #define MESSY(f) ((f)&(MIXED|CAP|BACKR)) |
423 | #define PREF(f) ((f)&NOPROP) |
424 | #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) |
425 | #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) |
426 | short id; /* ID of subre (1..ntree-1) */ |
427 | int subno; /* subexpression number for 'b' and '(', or |
428 | * LATYPE code for lookaround constraint */ |
429 | short min; /* min repetitions for iteration or backref */ |
430 | short max; /* max repetitions for iteration or backref */ |
431 | struct subre *left; /* left child, if any (also freelist chain) */ |
432 | struct subre *right; /* right child, if any */ |
433 | struct state *begin; /* outarcs from here... */ |
434 | struct state *end; /* ...ending in inarcs here */ |
435 | struct cnfa cnfa; /* compacted NFA, if any */ |
436 | struct subre *chain; /* for bookkeeping and error cleanup */ |
437 | }; |
438 | |
439 | |
440 | |
441 | /* |
442 | * table of function pointers for generic manipulation functions |
443 | * A regex_t's re_fns points to one of these. |
444 | */ |
445 | struct fns |
446 | { |
447 | void FUNCPTR(free, (regex_t *)); |
448 | int FUNCPTR(cancel_requested, (void)); |
449 | int FUNCPTR(stack_too_deep, (void)); |
450 | }; |
451 | |
452 | #define CANCEL_REQUESTED(re) \ |
453 | ((*((struct fns *) (re)->re_fns)->cancel_requested) ()) |
454 | |
455 | #define STACK_TOO_DEEP(re) \ |
456 | ((*((struct fns *) (re)->re_fns)->stack_too_deep) ()) |
457 | |
458 | |
459 | /* |
460 | * the insides of a regex_t, hidden behind a void * |
461 | */ |
462 | struct guts |
463 | { |
464 | int magic; |
465 | #define GUTSMAGIC 0xfed9 |
466 | int cflags; /* copy of compile flags */ |
467 | long info; /* copy of re_info */ |
468 | size_t nsub; /* copy of re_nsub */ |
469 | struct subre *tree; |
470 | struct cnfa search; /* for fast preliminary search */ |
471 | int ntree; /* number of subre's, plus one */ |
472 | struct colormap cmap; |
473 | int FUNCPTR(compare, (const chr *, const chr *, size_t)); |
474 | struct subre *lacons; /* lookaround-constraint vector */ |
475 | int nlacons; /* size of lacons[]; note that only slots |
476 | * numbered 1 .. nlacons-1 are used */ |
477 | }; |
478 | |
479 | |
480 | /* prototypes for functions that are exported from regcomp.c to regexec.c */ |
481 | extern void pg_set_regex_collation(Oid collation); |
482 | extern color pg_reg_getcolor(struct colormap *cm, chr c); |
483 | |