| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * regexport.c |
| 4 | * Functions for exporting info about a regex's NFA |
| 5 | * |
| 6 | * In this implementation, the NFA defines a necessary but not sufficient |
| 7 | * condition for a string to match the regex: that is, there can be strings |
| 8 | * that match the NFA but don't match the full regex, but not vice versa. |
| 9 | * Thus, for example, it is okay for the functions below to treat lookaround |
| 10 | * constraints as no-ops, since they merely constrain the string some more. |
| 11 | * |
| 12 | * Notice that these functions return info into caller-provided arrays |
| 13 | * rather than doing their own malloc's. This simplifies the APIs by |
| 14 | * eliminating a class of error conditions, and in the case of colors |
| 15 | * allows the caller to decide how big is too big to bother with. |
| 16 | * |
| 17 | * |
| 18 | * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group |
| 19 | * Portions Copyright (c) 1998, 1999 Henry Spencer |
| 20 | * |
| 21 | * IDENTIFICATION |
| 22 | * src/backend/regex/regexport.c |
| 23 | * |
| 24 | *------------------------------------------------------------------------- |
| 25 | */ |
| 26 | |
| 27 | #include "regex/regguts.h" |
| 28 | |
| 29 | #include "regex/regexport.h" |
| 30 | |
| 31 | |
| 32 | /* |
| 33 | * Get total number of NFA states. |
| 34 | */ |
| 35 | int |
| 36 | pg_reg_getnumstates(const regex_t *regex) |
| 37 | { |
| 38 | struct cnfa *cnfa; |
| 39 | |
| 40 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 41 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 42 | |
| 43 | return cnfa->nstates; |
| 44 | } |
| 45 | |
| 46 | /* |
| 47 | * Get initial state of NFA. |
| 48 | */ |
| 49 | int |
| 50 | pg_reg_getinitialstate(const regex_t *regex) |
| 51 | { |
| 52 | struct cnfa *cnfa; |
| 53 | |
| 54 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 55 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 56 | |
| 57 | return cnfa->pre; |
| 58 | } |
| 59 | |
| 60 | /* |
| 61 | * Get final state of NFA. |
| 62 | */ |
| 63 | int |
| 64 | pg_reg_getfinalstate(const regex_t *regex) |
| 65 | { |
| 66 | struct cnfa *cnfa; |
| 67 | |
| 68 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 69 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 70 | |
| 71 | return cnfa->post; |
| 72 | } |
| 73 | |
| 74 | /* |
| 75 | * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON |
| 76 | * arcs from the caller, treating any LACON as being automatically satisfied. |
| 77 | * Since the output representation does not support arcs that consume no |
| 78 | * character when traversed, we have to recursively traverse LACON arcs here, |
| 79 | * and report whatever normal arcs are reachable by traversing LACON arcs. |
| 80 | * Note that this wouldn't work if it were possible to reach the final state |
| 81 | * via LACON traversal, but the regex library never builds NFAs that have |
| 82 | * LACON arcs leading directly to the final state. (This is because the |
| 83 | * regex executor is designed to consume one character beyond the nominal |
| 84 | * match end --- possibly an EOS indicator --- so there is always a set of |
| 85 | * ordinary arcs leading to the final state.) |
| 86 | * |
| 87 | * traverse_lacons is a recursive subroutine used by both exported functions |
| 88 | * to count and then emit the reachable regular arcs. *arcs_count is |
| 89 | * incremented by the number of reachable arcs, and as many as will fit in |
| 90 | * arcs_len (possibly 0) are emitted into arcs[]. |
| 91 | */ |
| 92 | static void |
| 93 | traverse_lacons(struct cnfa *cnfa, int st, |
| 94 | int *arcs_count, |
| 95 | regex_arc_t *arcs, int arcs_len) |
| 96 | { |
| 97 | struct carc *ca; |
| 98 | |
| 99 | /* |
| 100 | * Since this function recurses, it could theoretically be driven to stack |
| 101 | * overflow. In practice, this is mostly useful to backstop against a |
| 102 | * failure of the regex compiler to remove a loop of LACON arcs. |
| 103 | */ |
| 104 | check_stack_depth(); |
| 105 | |
| 106 | for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) |
| 107 | { |
| 108 | if (ca->co < cnfa->ncolors) |
| 109 | { |
| 110 | /* Ordinary arc, so count and possibly emit it */ |
| 111 | int ndx = (*arcs_count)++; |
| 112 | |
| 113 | if (ndx < arcs_len) |
| 114 | { |
| 115 | arcs[ndx].co = ca->co; |
| 116 | arcs[ndx].to = ca->to; |
| 117 | } |
| 118 | } |
| 119 | else |
| 120 | { |
| 121 | /* LACON arc --- assume it's satisfied and recurse... */ |
| 122 | /* ... but first, assert it doesn't lead directly to post state */ |
| 123 | Assert(ca->to != cnfa->post); |
| 124 | |
| 125 | traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); |
| 126 | } |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | /* |
| 131 | * Get number of outgoing NFA arcs of state number "st". |
| 132 | */ |
| 133 | int |
| 134 | pg_reg_getnumoutarcs(const regex_t *regex, int st) |
| 135 | { |
| 136 | struct cnfa *cnfa; |
| 137 | int arcs_count; |
| 138 | |
| 139 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 140 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 141 | |
| 142 | if (st < 0 || st >= cnfa->nstates) |
| 143 | return 0; |
| 144 | arcs_count = 0; |
| 145 | traverse_lacons(cnfa, st, &arcs_count, NULL, 0); |
| 146 | return arcs_count; |
| 147 | } |
| 148 | |
| 149 | /* |
| 150 | * Write array of outgoing NFA arcs of state number "st" into arcs[], |
| 151 | * whose length arcs_len must be at least as long as indicated by |
| 152 | * pg_reg_getnumoutarcs(), else not all arcs will be returned. |
| 153 | */ |
| 154 | void |
| 155 | pg_reg_getoutarcs(const regex_t *regex, int st, |
| 156 | regex_arc_t *arcs, int arcs_len) |
| 157 | { |
| 158 | struct cnfa *cnfa; |
| 159 | int arcs_count; |
| 160 | |
| 161 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 162 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 163 | |
| 164 | if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) |
| 165 | return; |
| 166 | arcs_count = 0; |
| 167 | traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); |
| 168 | } |
| 169 | |
| 170 | /* |
| 171 | * Get total number of colors. |
| 172 | */ |
| 173 | int |
| 174 | pg_reg_getnumcolors(const regex_t *regex) |
| 175 | { |
| 176 | struct colormap *cm; |
| 177 | |
| 178 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 179 | cm = &((struct guts *) regex->re_guts)->cmap; |
| 180 | |
| 181 | return cm->max + 1; |
| 182 | } |
| 183 | |
| 184 | /* |
| 185 | * Check if color is beginning of line/string. |
| 186 | * |
| 187 | * (We might at some point need to offer more refined handling of pseudocolors, |
| 188 | * but this will do for now.) |
| 189 | */ |
| 190 | int |
| 191 | pg_reg_colorisbegin(const regex_t *regex, int co) |
| 192 | { |
| 193 | struct cnfa *cnfa; |
| 194 | |
| 195 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 196 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 197 | |
| 198 | if (co == cnfa->bos[0] || co == cnfa->bos[1]) |
| 199 | return true; |
| 200 | else |
| 201 | return false; |
| 202 | } |
| 203 | |
| 204 | /* |
| 205 | * Check if color is end of line/string. |
| 206 | */ |
| 207 | int |
| 208 | pg_reg_colorisend(const regex_t *regex, int co) |
| 209 | { |
| 210 | struct cnfa *cnfa; |
| 211 | |
| 212 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 213 | cnfa = &((struct guts *) regex->re_guts)->search; |
| 214 | |
| 215 | if (co == cnfa->eos[0] || co == cnfa->eos[1]) |
| 216 | return true; |
| 217 | else |
| 218 | return false; |
| 219 | } |
| 220 | |
| 221 | /* |
| 222 | * Get number of member chrs of color number "co". |
| 223 | * |
| 224 | * Note: we return -1 if the color number is invalid, or if it is a special |
| 225 | * color (WHITE or a pseudocolor), or if the number of members is uncertain. |
| 226 | * Callers should not try to extract the members if -1 is returned. |
| 227 | */ |
| 228 | int |
| 229 | pg_reg_getnumcharacters(const regex_t *regex, int co) |
| 230 | { |
| 231 | struct colormap *cm; |
| 232 | |
| 233 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 234 | cm = &((struct guts *) regex->re_guts)->cmap; |
| 235 | |
| 236 | if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */ |
| 237 | return -1; |
| 238 | if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ |
| 239 | return -1; |
| 240 | |
| 241 | /* |
| 242 | * If the color appears anywhere in the high colormap, treat its number of |
| 243 | * members as uncertain. In principle we could determine all the specific |
| 244 | * chrs corresponding to each such entry, but it would be expensive |
| 245 | * (particularly if character class tests are required) and it doesn't |
| 246 | * seem worth it. |
| 247 | */ |
| 248 | if (cm->cd[co].nuchrs != 0) |
| 249 | return -1; |
| 250 | |
| 251 | /* OK, return the known number of member chrs */ |
| 252 | return cm->cd[co].nschrs; |
| 253 | } |
| 254 | |
| 255 | /* |
| 256 | * Write array of member chrs of color number "co" into chars[], |
| 257 | * whose length chars_len must be at least as long as indicated by |
| 258 | * pg_reg_getnumcharacters(), else not all chars will be returned. |
| 259 | * |
| 260 | * Fetching the members of WHITE or a pseudocolor is not supported. |
| 261 | * |
| 262 | * Caution: this is a relatively expensive operation. |
| 263 | */ |
| 264 | void |
| 265 | pg_reg_getcharacters(const regex_t *regex, int co, |
| 266 | pg_wchar *chars, int chars_len) |
| 267 | { |
| 268 | struct colormap *cm; |
| 269 | chr c; |
| 270 | |
| 271 | assert(regex != NULL && regex->re_magic == REMAGIC); |
| 272 | cm = &((struct guts *) regex->re_guts)->cmap; |
| 273 | |
| 274 | if (co <= 0 || co > cm->max || chars_len <= 0) |
| 275 | return; |
| 276 | if (cm->cd[co].flags & PSEUDO) |
| 277 | return; |
| 278 | |
| 279 | /* |
| 280 | * We need only examine the low character map; there should not be any |
| 281 | * matching entries in the high map. |
| 282 | */ |
| 283 | for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) |
| 284 | { |
| 285 | if (cm->locolormap[c - CHR_MIN] == co) |
| 286 | { |
| 287 | *chars++ = c; |
| 288 | if (--chars_len == 0) |
| 289 | break; |
| 290 | } |
| 291 | } |
| 292 | } |
| 293 | |